contrib/llvm/lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86ShuffleDecodeConstantPool.h"
  22 #include "X86TargetMachine.h"
  23 #include "X86TargetObjectFile.h"
  24 #include "llvm/ADT/SmallBitVector.h"
  25 #include "llvm/ADT/SmallSet.h"
  26 #include "llvm/ADT/Statistic.h"
  27 #include "llvm/ADT/StringExtras.h"
  28 #include "llvm/ADT/StringSwitch.h"
  29 #include "llvm/Analysis/EHPersonalities.h"
  30 #include "llvm/CodeGen/IntrinsicLowering.h"
  31 #include "llvm/CodeGen/MachineFrameInfo.h"
  32 #include "llvm/CodeGen/MachineFunction.h"
  33 #include "llvm/CodeGen/MachineInstrBuilder.h"
  34 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  35 #include "llvm/CodeGen/MachineModuleInfo.h"
  36 #include "llvm/CodeGen/MachineRegisterInfo.h"
  37 #include "llvm/CodeGen/WinEHFuncInfo.h"
  38 #include "llvm/IR/CallSite.h"
  39 #include "llvm/IR/CallingConv.h"
  40 #include "llvm/IR/Constants.h"
  41 #include "llvm/IR/DerivedTypes.h"
  42 #include "llvm/IR/Function.h"
  43 #include "llvm/IR/GlobalAlias.h"
  44 #include "llvm/IR/GlobalVariable.h"
  45 #include "llvm/IR/Instructions.h"
  46 #include "llvm/IR/Intrinsics.h"
  47 #include "llvm/MC/MCAsmInfo.h"
  48 #include "llvm/MC/MCContext.h"
  49 #include "llvm/MC/MCExpr.h"
  50 #include "llvm/MC/MCSymbol.h"
  51 #include "llvm/Support/CommandLine.h"
  52 #include "llvm/Support/Debug.h"
  53 #include "llvm/Support/ErrorHandling.h"
  54 #include "llvm/Support/MathExtras.h"
  55 #include "llvm/Target/TargetOptions.h"
  56 #include "X86IntrinsicsInfo.h"
  57 #include <bitset>
  58 #include <numeric>
  59 #include <cctype>
  60 using namespace llvm;
  61
  62 #define DEBUG_TYPE "x86-isel"
  63
  64 STATISTIC(NumTailCalls, "Number of tail calls");
  65
  66 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  67     "x86-experimental-vector-widening-legalization", cl::Hidden,
  68     cl::desc("Enable an experimental vector type legalization through widening "
  69              "rather than promotion."),
  70     cl::init(false)); // Disabled unless the flag is passed explicitly.
  71
  72 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
  73                                      const X86Subtarget &STI)
  74     : TargetLowering(TM), Subtarget(STI) {
  75   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  76   X86ScalarSSEf64 = Subtarget.hasSSE2();
  77   X86ScalarSSEf32 = Subtarget.hasSSE1();
  78   MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
  79
  80   // Set up the TargetLowering object.
  81
  82   // X86 is weird. It always uses i8 for shift amounts and setcc results.
  83   setBooleanContents(ZeroOrOneBooleanContent);
  84   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  85   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  86
  87   // For 64-bit, since we have so many registers, use the ILP scheduler.
  88   // For 32-bit, use the register pressure specific scheduling.
  89   // For Atom, always use ILP scheduling.
  90   if (Subtarget.isAtom())
  91     setSchedulingPreference(Sched::ILP);
  92   else if (Subtarget.is64Bit())
  93     setSchedulingPreference(Sched::ILP);
  94   else
  95     setSchedulingPreference(Sched::RegPressure);
  96   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  97   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  98
  99   // Bypass expensive divides on Atom when compiling with O2.
 100   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 101     if (Subtarget.hasSlowDivide32())
 102       addBypassSlowDiv(32, 8);
 103     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
 104       addBypassSlowDiv(64, 16);
 105   }
 106
 107   if (Subtarget.isTargetKnownWindowsMSVC()) {
 108     // Setup Windows compiler runtime calls.
 109     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 110     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 111     setLibcallName(RTLIB::SREM_I64, "_allrem");
 112     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 113     setLibcallName(RTLIB::MUL_I64, "_allmul");
 114     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 115     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 116     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 117     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 118     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 119   }
 120
 121   if (Subtarget.isTargetDarwin()) {
 122     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 123     setUseUnderscoreSetJmp(false);
 124     setUseUnderscoreLongJmp(false);
 125   } else if (Subtarget.isTargetWindowsGNU()) {
 126     // MS runtime is weird: it exports _setjmp, but longjmp!
 127     setUseUnderscoreSetJmp(true);
 128     setUseUnderscoreLongJmp(false);
 129   } else {
 130     setUseUnderscoreSetJmp(true);
 131     setUseUnderscoreLongJmp(true);
 132   }
 133
 134   // Set up the register classes.
 135   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 136   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 137   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 138   if (Subtarget.is64Bit())
 139     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 140
 141   for (MVT VT : MVT::integer_valuetypes())
 142     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 143
 144   // We don't accept any truncstore of integer registers.
 145   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 146   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 147   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 148   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 149   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 150   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 151
 152   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 153
 154   // SETOEQ and SETUNE require checking two conditions.
 155   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 156   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 157   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 158   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 159   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 160   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 161
 162   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 163   // operation.
 164   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 165   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 166   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 167
 168   if (Subtarget.is64Bit()) {
 169     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
 170       // f32/f64 are legal, f80 is custom.
 171       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
 172     else
 173       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
 174     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 175   } else if (!Subtarget.useSoftFloat()) {
 176     // We have an algorithm for SSE2->double, and we turn this into a
 177     // 64-bit FILD followed by conditional FADD for other targets.
 178     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 179     // We have an algorithm for SSE2, and we turn this into a 64-bit
 180     // FILD or VCVTUSI2SS/SD for other targets.
 181     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 182   }
 183
 184   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 185   // this operation.
 186   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 187   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 188
 189   if (!Subtarget.useSoftFloat()) {
 190     // SSE has no i16 to fp conversion, only i32.
 191     if (X86ScalarSSEf32) {
 192       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 193       // f32 and f64 cases are Legal, f80 case is not
 194       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 195     } else {
 196       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 197       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 198     }
 199   } else {
 200     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 201     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 202   }
 203
 204   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
 205   // this operation.
 206   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 207   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 208
 209   if (!Subtarget.useSoftFloat()) {
 210     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 211     // are Legal, f80 is custom lowered.
 212     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 213     setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 214
 215     if (X86ScalarSSEf32) {
 216       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 217       // f32 and f64 cases are Legal, f80 case is not
 218       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 219     } else {
 220       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 221       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 222     }
 223   } else {
 224     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 225     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
 226     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
 227   }
 228
 229   // Handle FP_TO_UINT by promoting the destination to a larger signed
 230   // conversion.
 231   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 232   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 233   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 234
 235   if (Subtarget.is64Bit()) {
 236     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
 237       // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
 238       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 239       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
 240     } else {
 241       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
 242       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
 243     }
 244   } else if (!Subtarget.useSoftFloat()) {
 245     // Since AVX is a superset of SSE3, only check for SSE here.
 246     if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
 247       // Expand FP_TO_UINT into a select.
 248       // FIXME: We would like to use a Custom expander here eventually to do
 249       // the optimal thing for SSE vs. the default expansion in the legalizer.
 250       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 251     else
 252       // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
 253       // With SSE3 we can use fisttpll to convert to a signed i64; without
 254       // SSE, we're stuck with a fistpll.
 255       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 256
 257     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 258   }
 259
 260   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 261   if (!X86ScalarSSEf64) {
 262     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 263     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 264     if (Subtarget.is64Bit()) {
 265       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 266       // Without SSE, i64->f64 goes through memory.
 267       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 268     }
 269   } else if (!Subtarget.is64Bit())
 270     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
 271
 272   // Scalar integer divide and remainder are lowered to use operations that
 273   // produce two results, to match the available instructions. This exposes
 274   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 275   // into a single instruction.
 276   //
 277   // Scalar integer multiply-high is also lowered to use two-result
 278   // operations, to match the available instructions. However, plain multiply
 279   // (low) operations are left as Legal, as there are single-result
 280   // instructions for this in x86. Using the two-result multiply instructions
 281   // when both high and low results are needed must be arranged by dagcombine.
 282   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 283     setOperationAction(ISD::MULHS, VT, Expand);
 284     setOperationAction(ISD::MULHU, VT, Expand);
 285     setOperationAction(ISD::SDIV, VT, Expand);
 286     setOperationAction(ISD::UDIV, VT, Expand);
 287     setOperationAction(ISD::SREM, VT, Expand);
 288     setOperationAction(ISD::UREM, VT, Expand);
 289
 290     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 291     setOperationAction(ISD::ADDC, VT, Custom);
 292     setOperationAction(ISD::ADDE, VT, Custom);
 293     setOperationAction(ISD::SUBC, VT, Custom);
 294     setOperationAction(ISD::SUBE, VT, Custom);
 295   }
 296
 297   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 298   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 299   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
 300                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
 301     setOperationAction(ISD::BR_CC,     VT, Expand);
 302     setOperationAction(ISD::SELECT_CC, VT, Expand);
 303   }
 304   if (Subtarget.is64Bit())
 305     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 306   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 307   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 308   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 309   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 310
 311   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 312   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 313   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 314   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 315
 316   // Promote the i8 variants and force them on up to i32 which has a shorter
 317   // encoding.
 318   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
 319   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 320   if (!Subtarget.hasBMI()) {
 321     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 322     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 323     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
 324     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
 325     if (Subtarget.is64Bit()) {
 326       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 327       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
 328     }
 329   }
 330
 331   if (Subtarget.hasLZCNT()) {
 332     // When promoting the i8 variants, force them to i32 for a shorter
 333     // encoding.
 334     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
 335     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 336   } else {
 337     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 338     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 339     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 340     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 341     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 342     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 343     if (Subtarget.is64Bit()) {
 344       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 345       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 346     }
 347   }
 348
 349   // Special handling for half-precision floating point conversions.
 350   // If we don't have F16C support, then lower half float conversions
 351   // into library calls.
 352   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
 353     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 354     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 355   }
 356
 357   // There's never any support for operations beyond MVT::f32.
 358   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 359   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 360   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 361   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 362
 363   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 364   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 365   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 366   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 367   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 368   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 369
 370   if (Subtarget.hasPOPCNT()) {
 371     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 372   } else {
 373     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 374     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 375     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 376     if (Subtarget.is64Bit())
 377       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 378   }
 379
 380   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 381
 382   if (!Subtarget.hasMOVBE())
 383     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 384
 385   // These should be promoted to a larger select which is supported.
 386   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 387   // X86 wants to expand cmov itself.
 388   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
 389     setOperationAction(ISD::SELECT, VT, Custom);
 390     setOperationAction(ISD::SETCC, VT, Custom);
 391   }
 392   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 393     if (VT == MVT::i64 && !Subtarget.is64Bit())
 394       continue;
 395     setOperationAction(ISD::SELECT, VT, Custom);
 396     setOperationAction(ISD::SETCC,  VT, Custom);
 397     setOperationAction(ISD::SETCCE, VT, Custom);
 398   }
 399   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 400   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 401   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 402   // support continuation, user-level threading, etc. As a result, no
 403   // other SjLj exception interfaces are implemented and please don't build
 404   // your own exception handling based on them.
 405   // LLVM/Clang supports zero-cost DWARF exception handling.
 406   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 407   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 408   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
 409   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
 410     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
 411
 412   // Darwin ABI issue.
 413   for (auto VT : { MVT::i32, MVT::i64 }) {
 414     if (VT == MVT::i64 && !Subtarget.is64Bit())
 415       continue;
 416     setOperationAction(ISD::ConstantPool    , VT, Custom);
 417     setOperationAction(ISD::JumpTable       , VT, Custom);
 418     setOperationAction(ISD::GlobalAddress   , VT, Custom);
 419     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
 420     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
 421     setOperationAction(ISD::BlockAddress    , VT, Custom);
 422   }
 423   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 424   for (auto VT : { MVT::i32, MVT::i64 }) {
 425     if (VT == MVT::i64 && !Subtarget.is64Bit())
 426       continue;
 427     setOperationAction(ISD::SHL_PARTS, VT, Custom);
 428     setOperationAction(ISD::SRA_PARTS, VT, Custom);
 429     setOperationAction(ISD::SRL_PARTS, VT, Custom);
 430   }
 431
 432   if (Subtarget.hasSSE1())
 433     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 434
 435   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 436
 437   // Expand certain atomics
 438   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 439     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 440     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 441     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
 442     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
 443     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
 444     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
 445     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 446   }
 447
 448   if (Subtarget.hasCmpxchg16b()) {
 449     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 450   }
 451
 452   // FIXME - use subtarget debug flags
 453   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
 454       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
 455       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
 456     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 457   }
 458
 459   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 460   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 461
 462   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 463   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 464
 465   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 466   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 467
 468   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 469   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 470   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 471   bool Is64Bit = Subtarget.is64Bit();
 472   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
 473   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
 474
 475   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 476   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 477
 478   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
 479
 480   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
 481   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
 482   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
 483
 484   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
 485     // f32 and f64 use SSE.
 486     // Set up the FP register classes.
 487     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 488     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 489
 490     for (auto VT : { MVT::f32, MVT::f64 }) {
 491       // Use ANDPD to simulate FABS.
 492       setOperationAction(ISD::FABS, VT, Custom);
 493
 494       // Use XORP to simulate FNEG.
 495       setOperationAction(ISD::FNEG, VT, Custom);
 496
 497       // Use ANDPD and ORPD to simulate FCOPYSIGN.
 498       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
 499
 500       // We don't support sin/cos/fmod
 501       setOperationAction(ISD::FSIN   , VT, Expand);
 502       setOperationAction(ISD::FCOS   , VT, Expand);
 503       setOperationAction(ISD::FSINCOS, VT, Expand);
 504     }
 505
 506     // Lower this to MOVMSK plus an AND.
 507     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 508     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 509
 510     // Expand FP immediates into loads from the stack, except for the special
 511     // cases we handle.
 512     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 513     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 514   } else if (UseX87 && X86ScalarSSEf32) {
 515     // Use SSE for f32, x87 for f64.
 516     // Set up the FP register classes.
 517     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 518     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 519
 520     // Use ANDPS to simulate FABS.
 521     setOperationAction(ISD::FABS , MVT::f32, Custom);
 522
 523     // Use XORP to simulate FNEG.
 524     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 525
 526     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 527
 528     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 529     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 530     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 531
 532     // We don't support sin/cos/fmod
 533     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 534     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 535     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 536
 537     // Special cases we handle for FP constants.
 538     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 539     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 540     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 541     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 542     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 543
 544     if (!TM.Options.UnsafeFPMath) {
 545       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 546       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 547       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 548     }
 549   } else if (UseX87) {
 550     // f32 and f64 in x87.
 551     // Set up the FP register classes.
 552     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 553     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 554
 555     for (auto VT : { MVT::f32, MVT::f64 }) {
 556       setOperationAction(ISD::UNDEF,     VT, Expand);
 557       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 558
 559       if (!TM.Options.UnsafeFPMath) {
 560         setOperationAction(ISD::FSIN   , VT, Expand);
 561         setOperationAction(ISD::FCOS   , VT, Expand);
 562         setOperationAction(ISD::FSINCOS, VT, Expand);
 563       }
 564     }
 565     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 566     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 567     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 568     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 569     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 570     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 571     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 572     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 573   }
 574
 575   // We don't support FMA.
 576   setOperationAction(ISD::FMA, MVT::f64, Expand);
 577   setOperationAction(ISD::FMA, MVT::f32, Expand);
 578
 579   // Long double always uses X87, except f128 in MMX.
 580   if (UseX87) {
 581     if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
 582       addRegisterClass(MVT::f128, &X86::FR128RegClass);
 583       ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
 584       setOperationAction(ISD::FABS , MVT::f128, Custom);
 585       setOperationAction(ISD::FNEG , MVT::f128, Custom);
 586       setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
 587     }
 588
 589     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 590     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 591     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 592     {
 593       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 594       addLegalFPImmediate(TmpFlt);  // FLD0
 595       TmpFlt.changeSign();
 596       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 597
 598       bool ignored;
 599       APFloat TmpFlt2(+1.0);
 600       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 601                       &ignored);
 602       addLegalFPImmediate(TmpFlt2);  // FLD1
 603       TmpFlt2.changeSign();
 604       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 605     }
 606
 607     if (!TM.Options.UnsafeFPMath) {
 608       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 609       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 610       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 611     }
 612
 613     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 614     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 615     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 616     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 617     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 618     setOperationAction(ISD::FMA, MVT::f80, Expand);
 619   }
 620
 621   // Always use a library call for pow.
 622   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 623   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 624   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 625
 626   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 627   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 628   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 629   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 630   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 631   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 632   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 633
 634   // Some FP actions are always expanded for vector types.
 635   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
 636                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
 637     setOperationAction(ISD::FSIN,      VT, Expand);
 638     setOperationAction(ISD::FSINCOS,   VT, Expand);
 639     setOperationAction(ISD::FCOS,      VT, Expand);
 640     setOperationAction(ISD::FREM,      VT, Expand);
 641     setOperationAction(ISD::FPOWI,     VT, Expand);
 642     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 643     setOperationAction(ISD::FPOW,      VT, Expand);
 644     setOperationAction(ISD::FLOG,      VT, Expand);
 645     setOperationAction(ISD::FLOG2,     VT, Expand);
 646     setOperationAction(ISD::FLOG10,    VT, Expand);
 647     setOperationAction(ISD::FEXP,      VT, Expand);
 648     setOperationAction(ISD::FEXP2,     VT, Expand);
 649   }
 650
 651   // First set operation action for all vector types to either promote
 652   // (for widening) or expand (for scalarization). Then we will selectively
 653   // turn on ones that can be effectively codegen'd.
 654   for (MVT VT : MVT::vector_valuetypes()) {
 655     setOperationAction(ISD::SDIV, VT, Expand);
 656     setOperationAction(ISD::UDIV, VT, Expand);
 657     setOperationAction(ISD::SREM, VT, Expand);
 658     setOperationAction(ISD::UREM, VT, Expand);
 659     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 660     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 661     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 662     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 663     setOperationAction(ISD::FMA,  VT, Expand);
 664     setOperationAction(ISD::FFLOOR, VT, Expand);
 665     setOperationAction(ISD::FCEIL, VT, Expand);
 666     setOperationAction(ISD::FTRUNC, VT, Expand);
 667     setOperationAction(ISD::FRINT, VT, Expand);
 668     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 669     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 670     setOperationAction(ISD::MULHS, VT, Expand);
 671     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 672     setOperationAction(ISD::MULHU, VT, Expand);
 673     setOperationAction(ISD::SDIVREM, VT, Expand);
 674     setOperationAction(ISD::UDIVREM, VT, Expand);
 675     setOperationAction(ISD::CTPOP, VT, Expand);
 676     setOperationAction(ISD::CTTZ, VT, Expand);
 677     setOperationAction(ISD::CTLZ, VT, Expand);
 678     setOperationAction(ISD::ROTL, VT, Expand);
 679     setOperationAction(ISD::ROTR, VT, Expand);
 680     setOperationAction(ISD::BSWAP, VT, Expand);
 681     setOperationAction(ISD::SETCC, VT, Expand);
 682     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 683     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 684     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 685     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 686     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 687     setOperationAction(ISD::TRUNCATE, VT, Expand);
 688     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 689     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 690     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 691     setOperationAction(ISD::SELECT_CC, VT, Expand);
 692     for (MVT InnerVT : MVT::vector_valuetypes()) {
 693       setTruncStoreAction(InnerVT, VT, Expand);
 694
 695       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 696       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 697
 698       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 699       // types, we have to deal with them whether we ask for Expansion or not.
 700       // Setting Expand causes its own optimisation problems though, so leave
 701       // them legal.
 702       if (VT.getVectorElementType() == MVT::i1)
 703         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 704
 705       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
 706       // split/scalarized right now.
 707       if (VT.getVectorElementType() == MVT::f16)
 708         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 709     }
 710   }
 711
 712   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 713   // with -msoft-float, disable use of MMX as well.
 714   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
 715     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 716     // No operations on x86mmx supported, everything uses intrinsics.
 717   }
 718
 719   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
 720     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 721
 722     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 723     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 724     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 725     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 726     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
 727     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 728     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 729     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 730   }
 731
 732   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
 733     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 734
 735     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 736     // registers cannot be used even for integer operations.
 737     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 738     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 739     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 740     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 741
 742     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
 743     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 744     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 745     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 746     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 747     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
 748     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
 749     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 750     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 751     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 752     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 753     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 754
 755     setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
 756     setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
 757     setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
 758     setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
 759
 760     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 761     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 762     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 763     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 764
 765     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 766     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 767     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 768     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 769     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 770
 771     setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
 772     setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
 773     setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
 774     setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
 775
 776     setOperationAction(ISD::CTTZ,               MVT::v16i8, Custom);
 777     setOperationAction(ISD::CTTZ,               MVT::v8i16, Custom);
 778     setOperationAction(ISD::CTTZ,               MVT::v4i32, Custom);
 779     // ISD::CTTZ v2i64 - scalarization is faster.
 780
 781     // Custom lower build_vector, vector_shuffle, vselect, extract_vector_elt.
 782     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
 783       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 784       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 785       setOperationAction(ISD::VSELECT,            VT, Custom);
 786       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 787     }
 788
 789     // We support custom legalizing of sext and anyext loads for specific
 790     // memory vector types which we can load as a scalar (or sequence of
 791     // scalars) and extend in-register to a legal 128-bit vector type. For sext
 792     // loads these must work with a single scalar load.
 793     for (MVT VT : MVT::integer_vector_valuetypes()) {
 794       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
 795       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
 796       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
 797       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
 798       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
 799       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
 800       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
 801       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
 802       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
 803     }
 804
 805     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
 806       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 807       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 808       setOperationAction(ISD::VSELECT,            VT, Custom);
 809
 810       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
 811         continue;
 812
 813       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
 814       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 815     }
 816
 817     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
 818     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
 819       setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
 820       setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
 821       setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
 822       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
 823       setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
 824     }
 825
 826     // Custom lower v2i64 and v2f64 selects.
 827     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
 828     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
 829
 830     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
 831     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
 832
 833     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
 834
 835     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
 836     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
 837     // As there is no 64-bit GPR available, we need to build a special custom
 838     // sequence to convert from v2i32 to v2f32.
 839     if (!Subtarget.is64Bit())
 840       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
 841
 842     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
 843     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
 844
 845     for (MVT VT : MVT::fp_vector_valuetypes())
 846       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 847
 848     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
 849     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
 850     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
 851
 852     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
 853     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
 854     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
 855
 856     for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
 857       setOperationAction(ISD::SRL, VT, Custom);
 858       setOperationAction(ISD::SHL, VT, Custom);
 859       setOperationAction(ISD::SRA, VT, Custom);
 860     }
 861
 862     // In the customized shift lowering, the legal cases in AVX2 will be
 863     // recognized.
 864     for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
 865       setOperationAction(ISD::SRL, VT, Custom);
 866       setOperationAction(ISD::SHL, VT, Custom);
 867       setOperationAction(ISD::SRA, VT, Custom);
 868     }
 869   }
 870
 871   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
 872     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
 873     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
 874     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
 875     // ISD::CTLZ v4i32 - scalarization is faster.
 876     // ISD::CTLZ v2i64 - scalarization is faster.
 877   }
 878
 879   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
 880     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
 881       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
 882       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
 883       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
 884       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
 885       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
 886     }
 887
 888     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
 889     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
 890     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
 891     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
 892     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
 893     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
 894     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
 895     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
 896
 897     // FIXME: Do we need to handle scalar-to-vector here?
 898     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 899
 900     // We directly match byte blends in the backend as they match the VSELECT
 901     // condition form.
 902     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
 903
 904     // SSE41 brings specific instructions for doing vector sign extend even in
 905     // cases where we don't have SRA.
 906     for (MVT VT : MVT::integer_vector_valuetypes()) {
 907       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
 908       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
 909       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
 910     }
 911
 912     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
 913     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
 914     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
 915     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
 916     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
 917     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
 918     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
 919
 920     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
 921     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
 922     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
 923     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
 924     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
 925     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
 926
 927     // i8 vectors are custom because the source register and source
 928     // memory operand types are not the same width.
 929     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
 930   }
 931
 932   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
 933     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
 934                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
 935       setOperationAction(ISD::ROTL, VT, Custom);
 936
 937     // XOP can efficiently perform BITREVERSE with VPPERM.
 938     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
 939       setOperationAction(ISD::BITREVERSE, VT, Custom);
 940
 941     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
 942                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
 943       setOperationAction(ISD::BITREVERSE, VT, Custom);
 944   }
 945
 946   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
 947     bool HasInt256 = Subtarget.hasInt256();
 948
 949     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
 950     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
 951     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
 952     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
 953     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
 954     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
 955
 956     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
 957       setOperationAction(ISD::FFLOOR,     VT, Legal);
 958       setOperationAction(ISD::FCEIL,      VT, Legal);
 959       setOperationAction(ISD::FTRUNC,     VT, Legal);
 960       setOperationAction(ISD::FRINT,      VT, Legal);
 961       setOperationAction(ISD::FNEARBYINT, VT, Legal);
 962       setOperationAction(ISD::FNEG,       VT, Custom);
 963       setOperationAction(ISD::FABS,       VT, Custom);
 964     }
 965
 966     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
 967     // even though v8i16 is a legal type.
 968     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
 969     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
 970     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
 971
 972     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
 973     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
 974     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
 975
 976     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
 977     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
 978
 979     for (MVT VT : MVT::fp_vector_valuetypes())
 980       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
 981
 982     for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
 983       setOperationAction(ISD::SRL, VT, Custom);
 984       setOperationAction(ISD::SHL, VT, Custom);
 985       setOperationAction(ISD::SRA, VT, Custom);
 986     }
 987
 988     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
 989     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
 990     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
 991     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
 992
 993     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
 994     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
 995     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
 996
 997     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
 998     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
 999     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1000     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1001     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1002     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1003     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1004     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1005     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1006     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1007     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1008     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1009     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1010
1011     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1012       setOperationAction(ISD::CTPOP,           VT, Custom);
1013       setOperationAction(ISD::CTTZ,            VT, Custom);
1014     }
1015
1016     // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
1017     // as we end up splitting the 256-bit vectors.
1018     for (auto VT : { MVT::v32i8, MVT::v16i16 })
1019       setOperationAction(ISD::CTLZ,            VT, Custom);
1020
1021     if (HasInt256)
1022       for (auto VT : { MVT::v8i32, MVT::v4i64 })
1023         setOperationAction(ISD::CTLZ,          VT, Custom);
1024
1025     if (Subtarget.hasAnyFMA()) {
1026       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1027                        MVT::v2f64, MVT::v4f64 })
1028         setOperationAction(ISD::FMA, VT, Legal);
1029     }
1030
1031     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1032       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1033       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1034     }
1035
1036     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1037     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1038     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1039     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1040
1041     setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
1042     setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
1043
1044     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1045     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1046     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1047     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1048
1049     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1050       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1051       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1052       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1053       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1054     }
1055
1056     if (HasInt256) {
1057       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
1058       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
1059       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1060
1061       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1062       // when we have a 256bit-wide blend with immediate.
1063       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1064
1065       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1066       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1067       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1068       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1069       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1070       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1071       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1072
1073       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1074       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1075       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1076       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1077       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1078       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1079     }
1080
1081     // In the customized shift lowering, the legal cases in AVX2 will be
1082     // recognized.
1083     for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
1084       setOperationAction(ISD::SRL, VT, Custom);
1085       setOperationAction(ISD::SHL, VT, Custom);
1086       setOperationAction(ISD::SRA, VT, Custom);
1087     }
1088
1089     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1090                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1091       setOperationAction(ISD::MLOAD,  VT, Legal);
1092       setOperationAction(ISD::MSTORE, VT, Legal);
1093     }
1094
1095     // Extract subvector is special because the value type
1096     // (result) is 128-bit but the source is 256-bit wide.
1097     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1098                      MVT::v4f32, MVT::v2f64 }) {
1099       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1100     }
1101
1102     // Custom lower several nodes for 256-bit types.
1103     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1104                     MVT::v8f32, MVT::v4f64 }) {
1105       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1106       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1107       setOperationAction(ISD::VSELECT,            VT, Custom);
1108       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1109       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1110       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1111       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1112       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1113     }
1114
1115     if (HasInt256)
1116       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1117
1118     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
1119     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1120       setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
1121       setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
1122       setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
1123       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
1124       setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1125     }
1126   }
1127
1128   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1129     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1130     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1131     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1132     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1133
1134     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1135     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1136     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1137
1138     for (MVT VT : MVT::fp_vector_valuetypes())
1139       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1140
1141     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1142       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1143       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1144       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
1145       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1146       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1147       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1148     }
1149     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1150     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1151     setOperationAction(ISD::SETCCE,             MVT::i1,    Custom);
1152     setOperationAction(ISD::SELECT_CC,          MVT::i1,    Expand);
1153     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1154     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1155     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1156     setOperationAction(ISD::SUB,                MVT::i1,    Custom);
1157     setOperationAction(ISD::ADD,                MVT::i1,    Custom);
1158     setOperationAction(ISD::MUL,                MVT::i1,    Custom);
1159
1160     for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1161                    MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1162                    MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1163       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1164       setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1165       setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1166       setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
1167       setTruncStoreAction(VT, MaskVT, Custom);
1168     }
1169
1170     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1171       setOperationAction(ISD::FNEG,  VT, Custom);
1172       setOperationAction(ISD::FABS,  VT, Custom);
1173       setOperationAction(ISD::FMA,   VT, Legal);
1174     }
1175
1176     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1178     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1179     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1180     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1181     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1182     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1183     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1184     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1185     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1186     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1187     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1188     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
1189     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
1190     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1191     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1192
1193     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1194     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1195     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1196     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1197     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1198     if (Subtarget.hasVLX()){
1199       setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1200       setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1201       setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1202       setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1203       setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1204
1205       setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1206       setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1207       setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1208       setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1209       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1210     } else {
1211       setOperationAction(ISD::MLOAD,    MVT::v8i32, Custom);
1212       setOperationAction(ISD::MLOAD,    MVT::v8f32, Custom);
1213       setOperationAction(ISD::MSTORE,   MVT::v8i32, Custom);
1214       setOperationAction(ISD::MSTORE,   MVT::v8f32, Custom);
1215     }
1216     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1217     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1218     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1219     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1,  Custom);
1220     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v16i1, Custom);
1221     setOperationAction(ISD::VSELECT,            MVT::v8i1,  Expand);
1222     setOperationAction(ISD::VSELECT,            MVT::v16i1, Expand);
1223     if (Subtarget.hasDQI()) {
1224       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i64, Legal);
1225       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i64, Legal);
1226       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i64, Legal);
1227       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i64, Legal);
1228       if (Subtarget.hasVLX()) {
1229         setOperationAction(ISD::SINT_TO_FP,    MVT::v4i64, Legal);
1230         setOperationAction(ISD::SINT_TO_FP,    MVT::v2i64, Legal);
1231         setOperationAction(ISD::UINT_TO_FP,    MVT::v4i64, Legal);
1232         setOperationAction(ISD::UINT_TO_FP,    MVT::v2i64, Legal);
1233         setOperationAction(ISD::FP_TO_SINT,    MVT::v4i64, Legal);
1234         setOperationAction(ISD::FP_TO_SINT,    MVT::v2i64, Legal);
1235         setOperationAction(ISD::FP_TO_UINT,    MVT::v4i64, Legal);
1236         setOperationAction(ISD::FP_TO_UINT,    MVT::v2i64, Legal);
1237       }
1238     }
1239     if (Subtarget.hasVLX()) {
1240       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
1241       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
1242       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
1243       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
1244       setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
1245       setOperationAction(ISD::UINT_TO_FP,       MVT::v4i32, Legal);
1246       setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
1247       setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
1248       setOperationAction(ISD::ZERO_EXTEND,      MVT::v4i32, Custom);
1249       setOperationAction(ISD::ZERO_EXTEND,      MVT::v2i64, Custom);
1250
1251       // FIXME: These commands are available on SSE/AVX2, add relevant patterns.
1252       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
1253       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1254       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1255       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1256       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
1257       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1258       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1259       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1260       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1261       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1262     }
1263
1264     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1265     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1266     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1267     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1268     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1269     setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
1270     setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
1271     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1272     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1273     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1274     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1275     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1276     if (Subtarget.hasDQI()) {
1277       setOperationAction(ISD::SIGN_EXTEND,        MVT::v4i32, Custom);
1278       setOperationAction(ISD::SIGN_EXTEND,        MVT::v2i64, Custom);
1279     }
1280     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1281       setOperationAction(ISD::FFLOOR,     VT, Legal);
1282       setOperationAction(ISD::FCEIL,      VT, Legal);
1283       setOperationAction(ISD::FTRUNC,     VT, Legal);
1284       setOperationAction(ISD::FRINT,      VT, Legal);
1285       setOperationAction(ISD::FNEARBYINT, VT, Legal);
1286     }
1287
1288     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1289     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1290     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1291     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1292     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
1293
1294     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1295     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1296
1297     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1298
1299     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1300     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1301     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
1302     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1303     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1304     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1305     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1306     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1307     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1308     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1309     setOperationAction(ISD::SELECT,             MVT::v16i1, Custom);
1310     setOperationAction(ISD::SELECT,             MVT::v8i1,  Custom);
1311
1312     setOperationAction(ISD::SMAX,               MVT::v16i32, Legal);
1313     setOperationAction(ISD::SMAX,               MVT::v8i64, Legal);
1314     setOperationAction(ISD::UMAX,               MVT::v16i32, Legal);
1315     setOperationAction(ISD::UMAX,               MVT::v8i64, Legal);
1316     setOperationAction(ISD::SMIN,               MVT::v16i32, Legal);
1317     setOperationAction(ISD::SMIN,               MVT::v8i64, Legal);
1318     setOperationAction(ISD::UMIN,               MVT::v16i32, Legal);
1319     setOperationAction(ISD::UMIN,               MVT::v8i64, Legal);
1320
1321     setOperationAction(ISD::ADD,                MVT::v8i1,  Expand);
1322     setOperationAction(ISD::ADD,                MVT::v16i1, Expand);
1323     setOperationAction(ISD::SUB,                MVT::v8i1,  Expand);
1324     setOperationAction(ISD::SUB,                MVT::v16i1, Expand);
1325     setOperationAction(ISD::MUL,                MVT::v8i1,  Expand);
1326     setOperationAction(ISD::MUL,                MVT::v16i1, Expand);
1327
1328     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1329
1330     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331       setOperationAction(ISD::SRL, VT, Custom);
1332       setOperationAction(ISD::SHL, VT, Custom);
1333       setOperationAction(ISD::SRA, VT, Custom);
1334       setOperationAction(ISD::AND, VT, Legal);
1335       setOperationAction(ISD::OR,  VT, Legal);
1336       setOperationAction(ISD::XOR, VT, Legal);
1337       setOperationAction(ISD::CTPOP, VT, Custom);
1338       setOperationAction(ISD::CTTZ, VT, Custom);
1339     }
1340
1341     if (Subtarget.hasCDI()) {
1342       setOperationAction(ISD::CTLZ,             MVT::v8i64,  Legal);
1343       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1344
1345       setOperationAction(ISD::CTLZ,             MVT::v8i16,  Custom);
1346       setOperationAction(ISD::CTLZ,             MVT::v16i8,  Custom);
1347       setOperationAction(ISD::CTLZ,             MVT::v16i16, Custom);
1348       setOperationAction(ISD::CTLZ,             MVT::v32i8,  Custom);
1349
1350       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i64,  Custom);
1351       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v16i32, Custom);
1352
1353       if (Subtarget.hasVLX()) {
1354         setOperationAction(ISD::CTLZ,             MVT::v4i64, Legal);
1355         setOperationAction(ISD::CTLZ,             MVT::v8i32, Legal);
1356         setOperationAction(ISD::CTLZ,             MVT::v2i64, Legal);
1357         setOperationAction(ISD::CTLZ,             MVT::v4i32, Legal);
1358       } else {
1359         setOperationAction(ISD::CTLZ,             MVT::v4i64, Custom);
1360         setOperationAction(ISD::CTLZ,             MVT::v8i32, Custom);
1361         setOperationAction(ISD::CTLZ,             MVT::v2i64, Custom);
1362         setOperationAction(ISD::CTLZ,             MVT::v4i32, Custom);
1363       }
1364
1365       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i64, Custom);
1366       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i32, Custom);
1367       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v2i64, Custom);
1368       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i32, Custom);
1369     } // Subtarget.hasCDI()
1370
1371     if (Subtarget.hasDQI()) {
1372       if (Subtarget.hasVLX()) {
1373         setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
1374         setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
1375       }
1376       setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
1377     }
1378     // Custom lower several nodes.
1379     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1380                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1381       setOperationAction(ISD::MGATHER,  VT, Custom);
1382       setOperationAction(ISD::MSCATTER, VT, Custom);
1383     }
1384     // Extract subvector is special because the value type
1385     // (result) is 256-bit but the source is 512-bit wide.
1386     // 128-bit was made Custom under AVX1.
1387     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1388                      MVT::v8f32, MVT::v4f64 })
1389       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1390     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1391                      MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1392       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1393
1394     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1395       setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1396       setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1397       setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1398       setOperationAction(ISD::VSELECT,             VT, Legal);
1399       setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1400       setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1401       setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1402       setOperationAction(ISD::MLOAD,               VT, Legal);
1403       setOperationAction(ISD::MSTORE,              VT, Legal);
1404       setOperationAction(ISD::MGATHER,             VT, Legal);
1405       setOperationAction(ISD::MSCATTER,            VT, Custom);
1406     }
1407     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1408       setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1409     }
1410   }// has  AVX-512
1411
1412   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1413     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1414     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1415
1416     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1417     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1418
1419     setOperationAction(ISD::ADD,                MVT::v32i1, Expand);
1420     setOperationAction(ISD::ADD,                MVT::v64i1, Expand);
1421     setOperationAction(ISD::SUB,                MVT::v32i1, Expand);
1422     setOperationAction(ISD::SUB,                MVT::v64i1, Expand);
1423     setOperationAction(ISD::MUL,                MVT::v32i1, Expand);
1424     setOperationAction(ISD::MUL,                MVT::v64i1, Expand);
1425
1426     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1427     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1428     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1429     setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
1430     setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
1431     setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
1432     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
1433     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
1434     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
1435     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
1436     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
1437     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
1438     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Custom);
1439     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Custom);
1440     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1441     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1442     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
1443     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
1444     setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
1445     setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
1446     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1447     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1448     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
1449     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
1450     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
1451     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
1452     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
1453     setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1454     setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1455     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
1456     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
1457     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
1458     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
1459     setOperationAction(ISD::VSELECT,            MVT::v32i16, Legal);
1460     setOperationAction(ISD::VSELECT,            MVT::v64i8, Legal);
1461     setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
1462     setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
1463     setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
1464     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
1465     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
1466     setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
1467     setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
1468     setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
1469     setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
1470     setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
1471
1472     setOperationAction(ISD::SMAX,               MVT::v64i8, Legal);
1473     setOperationAction(ISD::SMAX,               MVT::v32i16, Legal);
1474     setOperationAction(ISD::UMAX,               MVT::v64i8, Legal);
1475     setOperationAction(ISD::UMAX,               MVT::v32i16, Legal);
1476     setOperationAction(ISD::SMIN,               MVT::v64i8, Legal);
1477     setOperationAction(ISD::SMIN,               MVT::v32i16, Legal);
1478     setOperationAction(ISD::UMIN,               MVT::v64i8, Legal);
1479     setOperationAction(ISD::UMIN,               MVT::v32i16, Legal);
1480
1481     setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1482     setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1483     if (Subtarget.hasVLX())
1484       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1485
1486     LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1487     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1488       setOperationAction(ISD::MLOAD,               VT, Action);
1489       setOperationAction(ISD::MSTORE,              VT, Action);
1490     }
1491
1492     if (Subtarget.hasCDI()) {
1493       setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
1494       setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
1495     }
1496
1497     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1498       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1499       setOperationAction(ISD::VSELECT,      VT, Legal);
1500       setOperationAction(ISD::SRL,          VT, Custom);
1501       setOperationAction(ISD::SHL,          VT, Custom);
1502       setOperationAction(ISD::SRA,          VT, Custom);
1503       setOperationAction(ISD::MLOAD,        VT, Legal);
1504       setOperationAction(ISD::MSTORE,       VT, Legal);
1505       setOperationAction(ISD::CTPOP,        VT, Custom);
1506       setOperationAction(ISD::CTTZ,         VT, Custom);
1507
1508       setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
1509       setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
1510       setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
1511     }
1512
1513     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1514       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1515       if (Subtarget.hasVLX()) {
1516         // FIXME: These commands are available on SSE/AVX2, add relevant patterns.
1517         setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1518         setLoadExtAction(ExtType, MVT::v8i16,  MVT::v8i8,  Legal);
1519       }
1520     }
1521   }
1522
1523   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1524     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1525     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1526
1527     setOperationAction(ISD::ADD,                MVT::v2i1, Expand);
1528     setOperationAction(ISD::ADD,                MVT::v4i1, Expand);
1529     setOperationAction(ISD::SUB,                MVT::v2i1, Expand);
1530     setOperationAction(ISD::SUB,                MVT::v4i1, Expand);
1531     setOperationAction(ISD::MUL,                MVT::v2i1, Expand);
1532     setOperationAction(ISD::MUL,                MVT::v4i1, Expand);
1533
1534     setOperationAction(ISD::TRUNCATE,           MVT::v2i1, Custom);
1535     setOperationAction(ISD::TRUNCATE,           MVT::v4i1, Custom);
1536     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1537     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1538     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
1539     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
1540     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
1541     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
1542     setOperationAction(ISD::SELECT,             MVT::v4i1, Custom);
1543     setOperationAction(ISD::SELECT,             MVT::v2i1, Custom);
1544     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i1, Custom);
1545     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i1, Custom);
1546     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i1, Custom);
1547     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i1, Custom);
1548     setOperationAction(ISD::VSELECT,            MVT::v2i1, Expand);
1549     setOperationAction(ISD::VSELECT,            MVT::v4i1, Expand);
1550
1551     for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
1552       setOperationAction(ISD::AND, VT, Legal);
1553       setOperationAction(ISD::OR,  VT, Legal);
1554       setOperationAction(ISD::XOR, VT, Legal);
1555     }
1556
1557     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1558       setOperationAction(ISD::SMAX, VT, Legal);
1559       setOperationAction(ISD::UMAX, VT, Legal);
1560       setOperationAction(ISD::SMIN, VT, Legal);
1561       setOperationAction(ISD::UMIN, VT, Legal);
1562     }
1563   }
1564
1565   // We want to custom lower some of our intrinsics.
1566   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1567   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1568   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1569   if (!Subtarget.is64Bit()) {
1570     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1571     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1572   }
1573
1574   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1575   // handle type legalization for these operations here.
1576   //
1577   // FIXME: We really should do custom legalization for addition and
1578   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1579   // than generic legalization for 64-bit multiplication-with-overflow, though.
1580   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1581     if (VT == MVT::i64 && !Subtarget.is64Bit())
1582       continue;
1583     // Add/Sub/Mul with overflow operations are custom lowered.
1584     setOperationAction(ISD::SADDO, VT, Custom);
1585     setOperationAction(ISD::UADDO, VT, Custom);
1586     setOperationAction(ISD::SSUBO, VT, Custom);
1587     setOperationAction(ISD::USUBO, VT, Custom);
1588     setOperationAction(ISD::SMULO, VT, Custom);
1589     setOperationAction(ISD::UMULO, VT, Custom);
1590   }
1591
1592   if (!Subtarget.is64Bit()) {
1593     // These libcalls are not available in 32-bit.
1594     setLibcallName(RTLIB::SHL_I128, nullptr);
1595     setLibcallName(RTLIB::SRL_I128, nullptr);
1596     setLibcallName(RTLIB::SRA_I128, nullptr);
1597   }
1598
1599   // Combine sin / cos into one node or libcall if possible.
1600   if (Subtarget.hasSinCos()) {
1601     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1602     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1603     if (Subtarget.isTargetDarwin()) {
1604       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1605       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1606       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1607       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1608     }
1609   }
1610
1611   if (Subtarget.isTargetWin64()) {
1612     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1613     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1614     setOperationAction(ISD::SREM, MVT::i128, Custom);
1615     setOperationAction(ISD::UREM, MVT::i128, Custom);
1616     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1617     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1618   }
1619
1620   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1621   // is. We should promote the value to 64-bits to solve this.
1622   // This is what the CRT headers do - `fmodf` is an inline header
1623   // function casting to f64 and calling `fmod`.
1624   if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
1625     for (ISD::NodeType Op :
1626          {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1627           ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1628       if (isOperationExpand(Op, MVT::f32))
1629         setOperationAction(Op, MVT::f32, Promote);
1630
1631   // We have target-specific dag combine patterns for the following nodes:
1632   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1633   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1634   setTargetDAGCombine(ISD::BITCAST);
1635   setTargetDAGCombine(ISD::VSELECT);
1636   setTargetDAGCombine(ISD::SELECT);
1637   setTargetDAGCombine(ISD::SHL);
1638   setTargetDAGCombine(ISD::SRA);
1639   setTargetDAGCombine(ISD::SRL);
1640   setTargetDAGCombine(ISD::OR);
1641   setTargetDAGCombine(ISD::AND);
1642   setTargetDAGCombine(ISD::ADD);
1643   setTargetDAGCombine(ISD::FADD);
1644   setTargetDAGCombine(ISD::FSUB);
1645   setTargetDAGCombine(ISD::FNEG);
1646   setTargetDAGCombine(ISD::FMA);
1647   setTargetDAGCombine(ISD::FMINNUM);
1648   setTargetDAGCombine(ISD::FMAXNUM);
1649   setTargetDAGCombine(ISD::SUB);
1650   setTargetDAGCombine(ISD::LOAD);
1651   setTargetDAGCombine(ISD::MLOAD);
1652   setTargetDAGCombine(ISD::STORE);
1653   setTargetDAGCombine(ISD::MSTORE);
1654   setTargetDAGCombine(ISD::TRUNCATE);
1655   setTargetDAGCombine(ISD::ZERO_EXTEND);
1656   setTargetDAGCombine(ISD::ANY_EXTEND);
1657   setTargetDAGCombine(ISD::SIGN_EXTEND);
1658   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1659   setTargetDAGCombine(ISD::SINT_TO_FP);
1660   setTargetDAGCombine(ISD::UINT_TO_FP);
1661   setTargetDAGCombine(ISD::SETCC);
1662   setTargetDAGCombine(ISD::MUL);
1663   setTargetDAGCombine(ISD::XOR);
1664   setTargetDAGCombine(ISD::MSCATTER);
1665   setTargetDAGCombine(ISD::MGATHER);
1666
1667   computeRegisterProperties(Subtarget.getRegisterInfo());
1668
1669   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1670   MaxStoresPerMemsetOptSize = 8;
1671   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1672   MaxStoresPerMemcpyOptSize = 4;
1673   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1674   MaxStoresPerMemmoveOptSize = 4;
1675   setPrefLoopAlignment(4); // 2^4 bytes.
1676
1677   // An out-of-order CPU can speculatively execute past a predictable branch,
1678   // but a conditional move could be stalled by an expensive earlier operation.
1679   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1680   EnableExtLdPromotion = true;
1681   setPrefFunctionAlignment(4); // 2^4 bytes.
1682
1683   verifyIntrinsicTables();
1684 }
1685
1686 // This has so far only been implemented for 64-bit MachO.
1687 bool X86TargetLowering::useLoadStackGuardNode() const {
1688   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1689 }
1690
1691 TargetLoweringBase::LegalizeTypeAction
1692 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1693   if (ExperimentalVectorWideningLegalization &&
1694       VT.getVectorNumElements() != 1 &&
1695       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1696     return TypeWidenVector;
1697
1698   return TargetLoweringBase::getPreferredVectorAction(VT);
1699 }
1700
1701 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1702                                           LLVMContext& Context,
1703                                           EVT VT) const {
1704   if (!VT.isVector())
1705     return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
1706
1707   if (VT.isSimple()) {
1708     MVT VVT = VT.getSimpleVT();
1709     const unsigned NumElts = VVT.getVectorNumElements();
1710     MVT EltVT = VVT.getVectorElementType();
1711     if (VVT.is512BitVector()) {
1712       if (Subtarget.hasAVX512())
1713         if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1714             EltVT == MVT::f32 || EltVT == MVT::f64)
1715           switch(NumElts) {
1716           case  8: return MVT::v8i1;
1717           case 16: return MVT::v16i1;
1718         }
1719       if (Subtarget.hasBWI())
1720         if (EltVT == MVT::i8 || EltVT == MVT::i16)
1721           switch(NumElts) {
1722           case 32: return MVT::v32i1;
1723           case 64: return MVT::v64i1;
1724         }
1725     }
1726
1727     if (Subtarget.hasBWI() && Subtarget.hasVLX())
1728       return MVT::getVectorVT(MVT::i1, NumElts);
1729
1730     if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1731       EVT LegalVT = getTypeToTransformTo(Context, VT);
1732       EltVT = LegalVT.getVectorElementType().getSimpleVT();
1733     }
1734
1735     if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1736       switch(NumElts) {
1737       case 2: return MVT::v2i1;
1738       case 4: return MVT::v4i1;
1739       case 8: return MVT::v8i1;
1740       }
1741   }
1742
1743   return VT.changeVectorElementTypeToInteger();
1744 }
1745
1746 /// Helper for getByValTypeAlignment to determine
1747 /// the desired ByVal argument alignment.
1748 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1749   if (MaxAlign == 16)
1750     return;
1751   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1752     if (VTy->getBitWidth() == 128)
1753       MaxAlign = 16;
1754   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1755     unsigned EltAlign = 0;
1756     getMaxByValAlign(ATy->getElementType(), EltAlign);
1757     if (EltAlign > MaxAlign)
1758       MaxAlign = EltAlign;
1759   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1760     for (auto *EltTy : STy->elements()) {
1761       unsigned EltAlign = 0;
1762       getMaxByValAlign(EltTy, EltAlign);
1763       if (EltAlign > MaxAlign)
1764         MaxAlign = EltAlign;
1765       if (MaxAlign == 16)
1766         break;
1767     }
1768   }
1769 }
1770
1771 /// Return the desired alignment for ByVal aggregate
1772 /// function arguments in the caller parameter area. For X86, aggregates
1773 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1774 /// are at 4-byte boundaries.
1775 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1776                                                   const DataLayout &DL) const {
1777   if (Subtarget.is64Bit()) {
1778     // Max of 8 and alignment of type.
1779     unsigned TyAlign = DL.getABITypeAlignment(Ty);
1780     if (TyAlign > 8)
1781       return TyAlign;
1782     return 8;
1783   }
1784
1785   unsigned Align = 4;
1786   if (Subtarget.hasSSE1())
1787     getMaxByValAlign(Ty, Align);
1788   return Align;
1789 }
1790
1791 /// Returns the target specific optimal type for load
1792 /// and store operations as a result of memset, memcpy, and memmove
1793 /// lowering. If DstAlign is zero that means it's safe to destination
1794 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1795 /// means there isn't a need to check it against alignment requirement,
1796 /// probably because the source does not need to be loaded. If 'IsMemset' is
1797 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1798 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1799 /// source is constant so it does not need to be loaded.
1800 /// It returns EVT::Other if the type should be determined using generic
1801 /// target-independent logic.
1802 EVT
1803 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1804                                        unsigned DstAlign, unsigned SrcAlign,
1805                                        bool IsMemset, bool ZeroMemset,
1806                                        bool MemcpyStrSrc,
1807                                        MachineFunction &MF) const {
1808   const Function *F = MF.getFunction();
1809   if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1810     if (Size >= 16 &&
1811         (!Subtarget.isUnalignedMem16Slow() ||
1812          ((DstAlign == 0 || DstAlign >= 16) &&
1813           (SrcAlign == 0 || SrcAlign >= 16)))) {
1814       // FIXME: Check if unaligned 32-byte accesses are slow.
1815       if (Size >= 32 && Subtarget.hasAVX()) {
1816         // Although this isn't a well-supported type for AVX1, we'll let
1817         // legalization and shuffle lowering produce the optimal codegen. If we
1818         // choose an optimal type with a vector element larger than a byte,
1819         // getMemsetStores() may create an intermediate splat (using an integer
1820         // multiply) before we splat as a vector.
1821         return MVT::v32i8;
1822       }
1823       if (Subtarget.hasSSE2())
1824         return MVT::v16i8;
1825       // TODO: Can SSE1 handle a byte vector?
1826       if (Subtarget.hasSSE1())
1827         return MVT::v4f32;
1828     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1829                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1830       // Do not use f64 to lower memcpy if source is string constant. It's
1831       // better to use i32 to avoid the loads.
1832       // Also, do not use f64 to lower memset unless this is a memset of zeros.
1833       // The gymnastics of splatting a byte value into an XMM register and then
1834       // only using 8-byte stores (because this is a CPU with slow unaligned
1835       // 16-byte accesses) makes that a loser.
1836       return MVT::f64;
1837     }
1838   }
1839   // This is a compromise. If we reach here, unaligned accesses may be slow on
1840   // this target. However, creating smaller, aligned accesses could be even
1841   // slower and would certainly be a lot more code.
1842   if (Subtarget.is64Bit() && Size >= 8)
1843     return MVT::i64;
1844   return MVT::i32;
1845 }
1846
1847 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1848   if (VT == MVT::f32)
1849     return X86ScalarSSEf32;
1850   else if (VT == MVT::f64)
1851     return X86ScalarSSEf64;
1852   return true;
1853 }
1854
1855 bool
1856 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1857                                                   unsigned,
1858                                                   unsigned,
1859                                                   bool *Fast) const {
1860   if (Fast) {
1861     switch (VT.getSizeInBits()) {
1862     default:
1863       // 8-byte and under are always assumed to be fast.
1864       *Fast = true;
1865       break;
1866     case 128:
1867       *Fast = !Subtarget.isUnalignedMem16Slow();
1868       break;
1869     case 256:
1870       *Fast = !Subtarget.isUnalignedMem32Slow();
1871       break;
1872     // TODO: What about AVX-512 (512-bit) accesses?
1873     }
1874   }
1875   // Misaligned accesses of any size are always allowed.
1876   return true;
1877 }
1878
1879 /// Return the entry encoding for a jump table in the
1880 /// current function.  The returned value is a member of the
1881 /// MachineJumpTableInfo::JTEntryKind enum.
1882 unsigned X86TargetLowering::getJumpTableEncoding() const {
1883   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1884   // symbol.
1885   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1886     return MachineJumpTableInfo::EK_Custom32;
1887
1888   // Otherwise, use the normal jump table encoding heuristics.
1889   return TargetLowering::getJumpTableEncoding();
1890 }
1891
1892 bool X86TargetLowering::useSoftFloat() const {
1893   return Subtarget.useSoftFloat();
1894 }
1895
1896 const MCExpr *
1897 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1898                                              const MachineBasicBlock *MBB,
1899                                              unsigned uid,MCContext &Ctx) const{
1900   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1901   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1902   // entries.
1903   return MCSymbolRefExpr::create(MBB->getSymbol(),
1904                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1905 }
1906
1907 /// Returns relocation base for the given PIC jumptable.
1908 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1909                                                     SelectionDAG &DAG) const {
1910   if (!Subtarget.is64Bit())
1911     // This doesn't have SDLoc associated with it, but is not really the
1912     // same as a Register.
1913     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1914                        getPointerTy(DAG.getDataLayout()));
1915   return Table;
1916 }
1917
1918 /// This returns the relocation base for the given PIC jumptable,
1919 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1920 const MCExpr *X86TargetLowering::
1921 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1922                              MCContext &Ctx) const {
1923   // X86-64 uses RIP relative addressing based on the jump table label.
1924   if (Subtarget.isPICStyleRIPRel())
1925     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1926
1927   // Otherwise, the reference is relative to the PIC base.
1928   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1929 }
1930
1931 std::pair<const TargetRegisterClass *, uint8_t>
1932 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1933                                            MVT VT) const {
1934   const TargetRegisterClass *RRC = nullptr;
1935   uint8_t Cost = 1;
1936   switch (VT.SimpleTy) {
1937   default:
1938     return TargetLowering::findRepresentativeClass(TRI, VT);
1939   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1940     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1941     break;
1942   case MVT::x86mmx:
1943     RRC = &X86::VR64RegClass;
1944     break;
1945   case MVT::f32: case MVT::f64:
1946   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1947   case MVT::v4f32: case MVT::v2f64:
1948   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1949   case MVT::v4f64:
1950     RRC = &X86::VR128RegClass;
1951     break;
1952   }
1953   return std::make_pair(RRC, Cost);
1954 }
1955
1956 unsigned X86TargetLowering::getAddressSpace() const {
1957   if (Subtarget.is64Bit())
1958     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1959   return 256;
1960 }
1961
1962 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1963   // glibc has a special slot for the stack guard in tcbhead_t, use it instead
1964   // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
1965   if (!Subtarget.isTargetGlibc())
1966     return TargetLowering::getIRStackGuard(IRB);
1967
1968   // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1969   // %gs:0x14 on i386
1970   unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
1971   unsigned AddressSpace = getAddressSpace();
1972   return ConstantExpr::getIntToPtr(
1973       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1974       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
1975 }
1976
1977 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
1978   // MSVC CRT provides functionalities for stack protection.
1979   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
1980     // MSVC CRT has a global variable holding security cookie.
1981     M.getOrInsertGlobal("__security_cookie",
1982                         Type::getInt8PtrTy(M.getContext()));
1983
1984     // MSVC CRT has a function to validate security cookie.
1985     auto *SecurityCheckCookie = cast<Function>(
1986         M.getOrInsertFunction("__security_check_cookie",
1987                               Type::getVoidTy(M.getContext()),
1988                               Type::getInt8PtrTy(M.getContext()), nullptr));
1989     SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
1990     SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
1991     return;
1992   }
1993   // glibc has a special slot for the stack guard.
1994   if (Subtarget.isTargetGlibc())
1995     return;
1996   TargetLowering::insertSSPDeclarations(M);
1997 }
1998
1999 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2000   // MSVC CRT has a global variable holding security cookie.
2001   if (Subtarget.getTargetTriple().isOSMSVCRT())
2002     return M.getGlobalVariable("__security_cookie");
2003   return TargetLowering::getSDagStackGuard(M);
2004 }
2005
2006 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2007   // MSVC CRT has a function to validate security cookie.
2008   if (Subtarget.getTargetTriple().isOSMSVCRT())
2009     return M.getFunction("__security_check_cookie");
2010   return TargetLowering::getSSPStackGuardCheck(M);
2011 }
2012
2013 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2014   if (!Subtarget.isTargetAndroid())
2015     return TargetLowering::getSafeStackPointerLocation(IRB);
2016
2017   // Android provides a fixed TLS slot for the SafeStack pointer. See the
2018   // definition of TLS_SLOT_SAFESTACK in
2019   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2020   unsigned AddressSpace, Offset;
2021
2022   // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2023   // %gs:0x24 on i386
2024   Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2025   AddressSpace = getAddressSpace();
2026   return ConstantExpr::getIntToPtr(
2027       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2028       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2029 }
2030
2031 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2032                                             unsigned DestAS) const {
2033   assert(SrcAS != DestAS && "Expected different address spaces!");
2034
2035   return SrcAS < 256 && DestAS < 256;
2036 }
2037
2038 //===----------------------------------------------------------------------===//
2039 //               Return Value Calling Convention Implementation
2040 //===----------------------------------------------------------------------===//
2041
2042 #include "X86GenCallingConv.inc"
2043
2044 bool X86TargetLowering::CanLowerReturn(
2045     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2046     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2047   SmallVector<CCValAssign, 16> RVLocs;
2048   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2049   return CCInfo.CheckReturn(Outs, RetCC_X86);
2050 }
2051
2052 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2053   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2054   return ScratchRegs;
2055 }
2056
2057 SDValue
2058 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2059                                bool isVarArg,
2060                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2061                                const SmallVectorImpl<SDValue> &OutVals,
2062                                const SDLoc &dl, SelectionDAG &DAG) const {
2063   MachineFunction &MF = DAG.getMachineFunction();
2064   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2065
2066   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2067     report_fatal_error("X86 interrupts may not return any value");
2068
2069   SmallVector<CCValAssign, 16> RVLocs;
2070   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2071   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2072
2073   SDValue Flag;
2074   SmallVector<SDValue, 6> RetOps;
2075   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2076   // Operand #1 = Bytes To Pop
2077   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2078                    MVT::i32));
2079
2080   // Copy the result values into the output registers.
2081   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2082     CCValAssign &VA = RVLocs[i];
2083     assert(VA.isRegLoc() && "Can only return in registers!");
2084     SDValue ValToCopy = OutVals[i];
2085     EVT ValVT = ValToCopy.getValueType();
2086
2087     // Promote values to the appropriate types.
2088     if (VA.getLocInfo() == CCValAssign::SExt)
2089       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2090     else if (VA.getLocInfo() == CCValAssign::ZExt)
2091       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2092     else if (VA.getLocInfo() == CCValAssign::AExt) {
2093       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2094         ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2095       else
2096         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2097     }
2098     else if (VA.getLocInfo() == CCValAssign::BCvt)
2099       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2100
2101     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2102            "Unexpected FP-extend for return value.");
2103
2104     // If this is x86-64, and we disabled SSE, we can't return FP values,
2105     // or SSE or MMX vectors.
2106     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2107          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2108           (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2109       report_fatal_error("SSE register return with SSE disabled");
2110     }
2111     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2112     // llvm-gcc has never done it right and no one has noticed, so this
2113     // should be OK for now.
2114     if (ValVT == MVT::f64 &&
2115         (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2116       report_fatal_error("SSE2 register return with SSE2 disabled");
2117
2118     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2119     // the RET instruction and handled by the FP Stackifier.
2120     if (VA.getLocReg() == X86::FP0 ||
2121         VA.getLocReg() == X86::FP1) {
2122       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2123       // change the value to the FP stack register class.
2124       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2125         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2126       RetOps.push_back(ValToCopy);
2127       // Don't emit a copytoreg.
2128       continue;
2129     }
2130
2131     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2132     // which is returned in RAX / RDX.
2133     if (Subtarget.is64Bit()) {
2134       if (ValVT == MVT::x86mmx) {
2135         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2136           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2137           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2138                                   ValToCopy);
2139           // If we don't have SSE2 available, convert to v4f32 so the generated
2140           // register is legal.
2141           if (!Subtarget.hasSSE2())
2142             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2143         }
2144       }
2145     }
2146
2147     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2148     Flag = Chain.getValue(1);
2149     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2150   }
2151
2152   // Swift calling convention does not require we copy the sret argument
2153   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2154
2155   // All x86 ABIs require that for returning structs by value we copy
2156   // the sret argument into %rax/%eax (depending on ABI) for the return.
2157   // We saved the argument into a virtual register in the entry block,
2158   // so now we copy the value out and into %rax/%eax.
2159   //
2160   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2161   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2162   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2163   // either case FuncInfo->setSRetReturnReg() will have been called.
2164   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2165     // When we have both sret and another return value, we should use the
2166     // original Chain stored in RetOps[0], instead of the current Chain updated
2167     // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2168
2169     // For the case of sret and another return value, we have
2170     //   Chain_0 at the function entry
2171     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
2172     // If we use Chain_1 in getCopyFromReg, we will have
2173     //   Val = getCopyFromReg(Chain_1)
2174     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
2175
2176     // getCopyToReg(Chain_0) will be glued together with
2177     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2178     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2179     //   Data dependency from Unit B to Unit A due to usage of Val in
2180     //     getCopyToReg(Chain_1, Val)
2181     //   Chain dependency from Unit A to Unit B
2182
2183     // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2184     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2185                                      getPointerTy(MF.getDataLayout()));
2186
2187     unsigned RetValReg
2188         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2189           X86::RAX : X86::EAX;
2190     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2191     Flag = Chain.getValue(1);
2192
2193     // RAX/EAX now acts like a return value.
2194     RetOps.push_back(
2195         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2196   }
2197
2198   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2199   const MCPhysReg *I =
2200       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2201   if (I) {
2202     for (; *I; ++I) {
2203       if (X86::GR64RegClass.contains(*I))
2204         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2205       else
2206         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2207     }
2208   }
2209
2210   RetOps[0] = Chain;  // Update chain.
2211
2212   // Add the flag if we have it.
2213   if (Flag.getNode())
2214     RetOps.push_back(Flag);
2215
2216   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2217   if (CallConv == CallingConv::X86_INTR)
2218     opcode = X86ISD::IRET;
2219   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2220 }
2221
2222 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2223   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2224     return false;
2225
2226   SDValue TCChain = Chain;
2227   SDNode *Copy = *N->use_begin();
2228   if (Copy->getOpcode() == ISD::CopyToReg) {
2229     // If the copy has a glue operand, we conservatively assume it isn't safe to
2230     // perform a tail call.
2231     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2232       return false;
2233     TCChain = Copy->getOperand(0);
2234   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2235     return false;
2236
2237   bool HasRet = false;
2238   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2239        UI != UE; ++UI) {
2240     if (UI->getOpcode() != X86ISD::RET_FLAG)
2241       return false;
2242     // If we are returning more than one value, we can definitely
2243     // not make a tail call see PR19530
2244     if (UI->getNumOperands() > 4)
2245       return false;
2246     if (UI->getNumOperands() == 4 &&
2247         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2248       return false;
2249     HasRet = true;
2250   }
2251
2252   if (!HasRet)
2253     return false;
2254
2255   Chain = TCChain;
2256   return true;
2257 }
2258
2259 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2260                                            ISD::NodeType ExtendKind) const {
2261   MVT ReturnMVT = MVT::i32;
2262
2263   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2264   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2265     // The ABI does not require i1, i8 or i16 to be extended.
2266     //
2267     // On Darwin, there is code in the wild relying on Clang's old behaviour of
2268     // always extending i8/i16 return values, so keep doing that for now.
2269     // (PR26665).
2270     ReturnMVT = MVT::i8;
2271   }
2272
2273   EVT MinVT = getRegisterType(Context, ReturnMVT);
2274   return VT.bitsLT(MinVT) ? MinVT : VT;
2275 }
2276
2277 /// Lower the result values of a call into the
2278 /// appropriate copies out of appropriate physical registers.
2279 ///
2280 SDValue X86TargetLowering::LowerCallResult(
2281     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2282     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2283     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2284
2285   // Assign locations to each value returned by this call.
2286   SmallVector<CCValAssign, 16> RVLocs;
2287   bool Is64Bit = Subtarget.is64Bit();
2288   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2289                  *DAG.getContext());
2290   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2291
2292   // Copy all of the result registers out of their specified physreg.
2293   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2294     CCValAssign &VA = RVLocs[i];
2295     EVT CopyVT = VA.getLocVT();
2296
2297     // If this is x86-64, and we disabled SSE, we can't return FP values
2298     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2299         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2300       report_fatal_error("SSE register return with SSE disabled");
2301     }
2302
2303     // If we prefer to use the value in xmm registers, copy it out as f80 and
2304     // use a truncate to move it from fp stack reg to xmm reg.
2305     bool RoundAfterCopy = false;
2306     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2307         isScalarFPTypeInSSEReg(VA.getValVT())) {
2308       if (!Subtarget.hasX87())
2309         report_fatal_error("X87 register return with X87 disabled");
2310       CopyVT = MVT::f80;
2311       RoundAfterCopy = (CopyVT != VA.getLocVT());
2312     }
2313
2314     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2315                                CopyVT, InFlag).getValue(1);
2316     SDValue Val = Chain.getValue(0);
2317
2318     if (RoundAfterCopy)
2319       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2320                         // This truncation won't change the value.
2321                         DAG.getIntPtrConstant(1, dl));
2322
2323     if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
2324       Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2325
2326     InFlag = Chain.getValue(2);
2327     InVals.push_back(Val);
2328   }
2329
2330   return Chain;
2331 }
2332
2333 //===----------------------------------------------------------------------===//
2334 //                C & StdCall & Fast Calling Convention implementation
2335 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack. Symbols are also decorated
//  in a special way. It doesn't support any vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
2342
2343 /// CallIsStructReturn - Determines whether a call uses struct return
2344 /// semantics.
2345 enum StructReturnType {
2346   NotStructReturn,
2347   RegStructReturn,
2348   StackStructReturn
2349 };
2350 static StructReturnType
2351 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2352   if (Outs.empty())
2353     return NotStructReturn;
2354
2355   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2356   if (!Flags.isSRet())
2357     return NotStructReturn;
2358   if (Flags.isInReg() || IsMCU)
2359     return RegStructReturn;
2360   return StackStructReturn;
2361 }
2362
2363 /// Determines whether a function uses struct return semantics.
2364 static StructReturnType
2365 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2366   if (Ins.empty())
2367     return NotStructReturn;
2368
2369   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2370   if (!Flags.isSRet())
2371     return NotStructReturn;
2372   if (Flags.isInReg() || IsMCU)
2373     return RegStructReturn;
2374   return StackStructReturn;
2375 }
2376
2377 /// Make a copy of an aggregate at address specified by "Src" to address
2378 /// "Dst" with size and alignment information specified by the specific
2379 /// parameter attribute. The copy will be passed as a byval function parameter.
2380 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2381                                          SDValue Chain, ISD::ArgFlagsTy Flags,
2382                                          SelectionDAG &DAG, const SDLoc &dl) {
2383   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2384
2385   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2386                        /*isVolatile*/false, /*AlwaysInline=*/true,
2387                        /*isTailCall*/false,
2388                        MachinePointerInfo(), MachinePointerInfo());
2389 }
2390
2391 /// Return true if the calling convention is one that we can guarantee TCO for.
2392 static bool canGuaranteeTCO(CallingConv::ID CC) {
2393   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2394           CC == CallingConv::HiPE || CC == CallingConv::HHVM);
2395 }
2396
2397 /// Return true if we might ever do TCO for calls with this calling convention.
2398 static bool mayTailCallThisCC(CallingConv::ID CC) {
2399   switch (CC) {
2400   // C calling conventions:
2401   case CallingConv::C:
2402   case CallingConv::X86_64_Win64:
2403   case CallingConv::X86_64_SysV:
2404   // Callee pop conventions:
2405   case CallingConv::X86_ThisCall:
2406   case CallingConv::X86_StdCall:
2407   case CallingConv::X86_VectorCall:
2408   case CallingConv::X86_FastCall:
2409     return true;
2410   default:
2411     return canGuaranteeTCO(CC);
2412   }
2413 }
2414
2415 /// Return true if the function is being made into a tailcall target by
2416 /// changing its ABI.
2417 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2418   return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2419 }
2420
2421 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2422   auto Attr =
2423       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2424   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2425     return false;
2426
2427   CallSite CS(CI);
2428   CallingConv::ID CalleeCC = CS.getCallingConv();
2429   if (!mayTailCallThisCC(CalleeCC))
2430     return false;
2431
2432   return true;
2433 }
2434
2435 SDValue
2436 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2437                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2438                                     const SDLoc &dl, SelectionDAG &DAG,
2439                                     const CCValAssign &VA,
2440                                     MachineFrameInfo *MFI, unsigned i) const {
2441   // Create the nodes corresponding to a load from this parameter slot.
2442   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2443   bool AlwaysUseMutable = shouldGuaranteeTCO(
2444       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2445   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2446   EVT ValVT;
2447
2448   // If value is passed by pointer we have address passed instead of the value
2449   // itself.
2450   bool ExtendedInMem = VA.isExtInLoc() &&
2451     VA.getValVT().getScalarType() == MVT::i1;
2452
2453   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2454     ValVT = VA.getLocVT();
2455   else
2456     ValVT = VA.getValVT();
2457
2458   // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2459   // taken by a return address.
2460   int Offset = 0;
2461   if (CallConv == CallingConv::X86_INTR) {
2462     const X86Subtarget& Subtarget =
2463         static_cast<const X86Subtarget&>(DAG.getSubtarget());
2464     // X86 interrupts may take one or two arguments.
2465     // On the stack there will be no return address as in regular call.
2466     // Offset of last argument need to be set to -4/-8 bytes.
2467     // Where offset of the first argument out of two, should be set to 0 bytes.
2468     Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2469   }
2470
2471   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2472   // changed with more analysis.
2473   // In case of tail call optimization mark all arguments mutable. Since they
2474   // could be overwritten by lowering of arguments in case of a tail call.
2475   if (Flags.isByVal()) {
2476     unsigned Bytes = Flags.getByValSize();
2477     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2478     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2479     // Adjust SP offset of interrupt parameter.
2480     if (CallConv == CallingConv::X86_INTR) {
2481       MFI->setObjectOffset(FI, Offset);
2482     }
2483     return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2484   } else {
2485     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2486                                     VA.getLocMemOffset(), isImmutable);
2487
2488     // Set SExt or ZExt flag.
2489     if (VA.getLocInfo() == CCValAssign::ZExt) {
2490       MFI->setObjectZExt(FI, true);
2491     } else if (VA.getLocInfo() == CCValAssign::SExt) {
2492       MFI->setObjectSExt(FI, true);
2493     }
2494
2495     // Adjust SP offset of interrupt parameter.
2496     if (CallConv == CallingConv::X86_INTR) {
2497       MFI->setObjectOffset(FI, Offset);
2498     }
2499
2500     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2501     SDValue Val = DAG.getLoad(
2502         ValVT, dl, Chain, FIN,
2503         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2504     return ExtendedInMem ?
2505       DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2506   }
2507 }
2508
2509 // FIXME: Get this from tablegen.
2510 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2511                                                 const X86Subtarget &Subtarget) {
2512   assert(Subtarget.is64Bit());
2513
2514   if (Subtarget.isCallingConvWin64(CallConv)) {
2515     static const MCPhysReg GPR64ArgRegsWin64[] = {
2516       X86::RCX, X86::RDX, X86::R8,  X86::R9
2517     };
2518     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2519   }
2520
2521   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2522     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2523   };
2524   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2525 }
2526
2527 // FIXME: Get this from tablegen.
2528 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2529                                                 CallingConv::ID CallConv,
2530                                                 const X86Subtarget &Subtarget) {
2531   assert(Subtarget.is64Bit());
2532   if (Subtarget.isCallingConvWin64(CallConv)) {
2533     // The XMM registers which might contain var arg parameters are shadowed
2534     // in their paired GPR.  So we only need to save the GPR to their home
2535     // slots.
2536     // TODO: __vectorcall will change this.
2537     return None;
2538   }
2539
2540   const Function *Fn = MF.getFunction();
2541   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2542   bool isSoftFloat = Subtarget.useSoftFloat();
2543   assert(!(isSoftFloat && NoImplicitFloatOps) &&
2544          "SSE register cannot be used when SSE is disabled!");
2545   if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2546     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2547     // registers.
2548     return None;
2549
2550   static const MCPhysReg XMMArgRegs64Bit[] = {
2551     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2552     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2553   };
2554   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2555 }
2556
/// Lowers the incoming (formal) arguments of the function being compiled.
///
/// Every location that CC_X86 assigns to \p Ins yields one SDValue appended to
/// \p InVals: register arguments are read with CopyFromReg from a register
/// added as live-in, memory arguments come from LowerMemArgument, and
/// arguments passed indirectly are additionally loaded through the pointer.
///
/// Along the way the following state is recorded in X86MachineFunctionInfo:
/// the register holding the sret pointer, the vararg frame index and register
/// save area, the registers forwarded to musttail calls, the number of bytes
/// to pop on return and the argument stack size.
///
/// \returns the chain after lowering: \p Chain, possibly extended by the sret
/// copy, the vararg register spills and the musttail forwarding copies.
2557 SDValue X86TargetLowering::LowerFormalArguments(
2558     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2559     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2560     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2561   MachineFunction &MF = DAG.getMachineFunction();
2562   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2563   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2564
       // An externally visible "main" on Cygwin/MinGW targets always keeps a
       // frame pointer.
2565   const Function *Fn = MF.getFunction();
2566   if (Fn->hasExternalLinkage() &&
2567       Subtarget.isTargetCygMing() &&
2568       Fn->getName() == "main")
2569     FuncInfo->setForceFramePointer(true);
2570
2571   MachineFrameInfo *MFI = MF.getFrameInfo();
2572   bool Is64Bit = Subtarget.is64Bit();
2573   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2574
2575   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2576          "Var args not supported with calling convention fastcc, ghc or hipe");
2577
       // An interrupt handler takes one argument, or two if the second one is an
       // i64 (64-bit mode) or an i32 (32-bit mode); anything else is fatal.
2578   if (CallConv == CallingConv::X86_INTR) {
2579     bool isLegal = Ins.size() == 1 ||
2580                    (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2581                                         (!Is64Bit && Ins[1].VT == MVT::i32)));
2582     if (!isLegal)
2583       report_fatal_error("X86 interrupts may take one or two arguments");
2584   }
2585
2586   // Assign locations to all of the incoming arguments.
2587   SmallVector<CCValAssign, 16> ArgLocs;
2588   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2589
2590   // Allocate shadow area for Win64
2591   if (IsWin64)
2592     CCInfo.AllocateStack(32, 8);
2593
2594   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2595
       // LastVal remembers the value number of the previous location; it is only
       // read by the assert inside the loop.
2596   unsigned LastVal = ~0U;
2597   SDValue ArgValue;
2598   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2599     CCValAssign &VA = ArgLocs[i];
2600     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2601     // places.
2602     assert(VA.getValNo() != LastVal &&
2603            "Don't support value assigned to multiple locs yet");
2604     (void)LastVal;
2605     LastVal = VA.getValNo();
2606
2607     if (VA.isRegLoc()) {
           // Select the register class that matches the location's value type.
2608       EVT RegVT = VA.getLocVT();
2609       const TargetRegisterClass *RC;
2610       if (RegVT == MVT::i32)
2611         RC = &X86::GR32RegClass;
2612       else if (Is64Bit && RegVT == MVT::i64)
2613         RC = &X86::GR64RegClass;
2614       else if (RegVT == MVT::f32)
2615         RC = &X86::FR32RegClass;
2616       else if (RegVT == MVT::f64)
2617         RC = &X86::FR64RegClass;
2618       else if (RegVT == MVT::f128)
2619         RC = &X86::FR128RegClass;
2620       else if (RegVT.is512BitVector())
2621         RC = &X86::VR512RegClass;
2622       else if (RegVT.is256BitVector())
2623         RC = &X86::VR256RegClass;
2624       else if (RegVT.is128BitVector())
2625         RC = &X86::VR128RegClass;
2626       else if (RegVT == MVT::x86mmx)
2627         RC = &X86::VR64RegClass;
2628       else if (RegVT == MVT::i1)
2629         RC = &X86::VK1RegClass;
2630       else if (RegVT == MVT::v8i1)
2631         RC = &X86::VK8RegClass;
2632       else if (RegVT == MVT::v16i1)
2633         RC = &X86::VK16RegClass;
2634       else if (RegVT == MVT::v32i1)
2635         RC = &X86::VK32RegClass;
2636       else if (RegVT == MVT::v64i1)
2637         RC = &X86::VK64RegClass;
2638       else
2639         llvm_unreachable("Unknown argument type!");
2640
           // Register the physical location as live-in and read the argument
           // through the register that addLiveIn returns.
2641       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2642       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2643
2644       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2645       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2646       // right size.
2647       if (VA.getLocInfo() == CCValAssign::SExt)
2648         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2649                                DAG.getValueType(VA.getValVT()));
2650       else if (VA.getLocInfo() == CCValAssign::ZExt)
2651         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2652                                DAG.getValueType(VA.getValVT()));
2653       else if (VA.getLocInfo() == CCValAssign::BCvt)
2654         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2655
2656       if (VA.isExtInLoc()) {
2657         // Handle MMX values passed in XMM regs.
2658         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2659           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2660         else
2661           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2662       }
2663     } else {
2664       assert(VA.isMemLoc());
2665       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2666     }
2667
2668     // If value is passed via pointer - do a load.
2669     if (VA.getLocInfo() == CCValAssign::Indirect)
2670       ArgValue =
2671           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
2672
2673     InVals.push_back(ArgValue);
2674   }
2675
2676   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2677     // Swift calling convention does not require we copy the sret argument
2678     // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
2679     if (CallConv == CallingConv::Swift)
2680       continue;
2681
2682     // All x86 ABIs require that for returning structs by value we copy the
2683     // sret argument into %rax/%eax (depending on ABI) for the return. Save
2684     // the argument into a virtual register so that we can access it from the
2685     // return points.
2686     if (Ins[i].Flags.isSRet()) {
2687       unsigned Reg = FuncInfo->getSRetReturnReg();
2688       if (!Reg) {
2689         MVT PtrTy = getPointerTy(DAG.getDataLayout());
2690         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2691         FuncInfo->setSRetReturnReg(Reg);
2692       }
           // The copy is chained to the entry node and then merged into Chain;
           // only the first sret argument is handled.
2693       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2694       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2695       break;
2696     }
2697   }
2698
2699   unsigned StackSize = CCInfo.getNextStackOffset();
2700   // Align stack specially for tail calls.
2701   if (shouldGuaranteeTCO(CallConv,
2702                          MF.getTarget().Options.GuaranteedTailCallOpt))
2703     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2704
2705   // If the function takes variable number of arguments, make a frame index for
2706   // the start of the first vararg value... for expansion of llvm.va_start. We
2707   // can skip this if there are no va_start calls.
       // In 32-bit mode this is not done for the fastcall and thiscall
       // conventions.
2708   if (MFI->hasVAStart() &&
2709       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2710                    CallConv != CallingConv::X86_ThisCall))) {
2711     FuncInfo->setVarArgsFrameIndex(
2712         MFI->CreateFixedObject(1, StackSize, true));
2713   }
2714
2715   // Figure out if XMM registers are in use.
2716   assert(!(Subtarget.useSoftFloat() &&
2717            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2718          "SSE register cannot be used when SSE is disabled!");
2719
2720   // 64-bit calling conventions support varargs and register parameters, so we
2721   // have to do extra work to spill them in the prologue.
2722   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2723     // Find the first unallocated argument registers.
2724     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2725     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2726     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2727     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2728     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2729            "SSE register cannot be used when SSE is disabled!");
2730
2731     // Gather all the live in physical registers.
         // Only the argument registers from the first unallocated one onward
         // are copied.
2732     SmallVector<SDValue, 6> LiveGPRs;
2733     SmallVector<SDValue, 8> LiveXMMRegs;
2734     SDValue ALVal;
2735     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2736       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2737       LiveGPRs.push_back(
2738           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2739     }
         // When the convention has XMM argument registers, AL is read as well;
         // its value is later handed to VASTART_SAVE_XMM_REGS.
2740     if (!ArgXMMs.empty()) {
2741       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2742       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2743       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2744         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2745         LiveXMMRegs.push_back(
2746             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2747       }
2748     }
2749
2750     if (IsWin64) {
2751       // Get to the caller-allocated home save location.  Add 8 to account
2752       // for the return address.
2753       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2754       FuncInfo->setRegSaveFrameIndex(
2755           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2756       // Fixup to set vararg frame on shadow area (4 x i64).
2757       if (NumIntRegs < 4)
2758         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2759     } else {
2760       // For X86-64, if there are vararg parameters that are passed via
2761       // registers, then we must store them to their spots on the stack so
2762       // they may be loaded by dereferencing the result of va_next.
           // The save area reserves 8 bytes per argument GPR followed by 16
           // bytes per argument XMM register; the two offsets point at the
           // first unallocated register of each kind.
2763       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2764       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2765       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2766           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2767     }
2768
2769     // Store the integer parameter registers.
         // Each store is chained to the chain result of its own CopyFromReg and
         // writes to consecutive 8-byte slots starting at the GP offset.
2770     SmallVector<SDValue, 8> MemOps;
2771     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2772                                       getPointerTy(DAG.getDataLayout()));
2773     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2774     for (SDValue Val : LiveGPRs) {
2775       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2776                                 RSFIN, DAG.getIntPtrConstant(Offset, dl));
2777       SDValue Store =
2778           DAG.getStore(Val.getValue(1), dl, Val, FIN,
2779                        MachinePointerInfo::getFixedStack(
2780                            DAG.getMachineFunction(),
2781                            FuncInfo->getRegSaveFrameIndex(), Offset));
2782       MemOps.push_back(Store);
2783       Offset += 8;
2784     }
2785
2786     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2787       // Now store the XMM (fp + vector) parameter registers.
           // Operands: chain, AL value, save-area frame index, FP offset, then
           // the live XMM register values.
2788       SmallVector<SDValue, 12> SaveXMMOps;
2789       SaveXMMOps.push_back(Chain);
2790       SaveXMMOps.push_back(ALVal);
2791       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2792                              FuncInfo->getRegSaveFrameIndex(), dl));
2793       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2794                              FuncInfo->getVarArgsFPOffset(), dl));
2795       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2796                         LiveXMMRegs.end());
2797       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2798                                    MVT::Other, SaveXMMOps));
2799     }
2800
         // Join all spills into a single chain.
2801     if (!MemOps.empty())
2802       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2803   }
2804
2805   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2806     // Find the largest legal vector type.
2807     MVT VecVT = MVT::Other;
2808     // FIXME: Only some x86_32 calling conventions support AVX512.
2809     if (Subtarget.hasAVX512() &&
2810         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2811                      CallConv == CallingConv::Intel_OCL_BI)))
2812       VecVT = MVT::v16f32;
2813     else if (Subtarget.hasAVX())
2814       VecVT = MVT::v8f32;
2815     else if (Subtarget.hasSSE2())
2816       VecVT = MVT::v4f32;
2817
2818     // We forward some GPRs and some vector types.
2819     SmallVector<MVT, 2> RegParmTypes;
2820     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2821     RegParmTypes.push_back(IntVT);
2822     if (VecVT != MVT::Other)
2823       RegParmTypes.push_back(VecVT);
2824
2825     // Compute the set of forwarded registers. The rest are scratch.
2826     SmallVectorImpl<ForwardedRegister> &Forwards =
2827         FuncInfo->getForwardedMustTailRegParms();
2828     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2829
2830     // Conservatively forward AL on x86_64, since it might be used for varargs.
2831     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2832       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2833       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2834     }
2835
2836     // Copy all forwards from physical to virtual registers.
         // F.VReg is first read as the source of the copy and then replaced by
         // a newly created virtual register that receives the value.
2837     for (ForwardedRegister &F : Forwards) {
2838       // FIXME: Can we use a less constrained schedule?
2839       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2840       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2841       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2842     }
2843   }
2844
2845   // Some CCs need callee pop.
2846   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2847                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2848     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2849   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
2850     // X86 interrupts must pop the error code if present
2851     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
2852   } else {
2853     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2854     // If this is an sret function, the return should pop the hidden pointer.
2855     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
2856         !Subtarget.getTargetTriple().isOSMSVCRT() &&
2857         argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
2858       FuncInfo->setBytesToPopOnReturn(4);
2859   }
2860
2861   if (!Is64Bit) {
2862     // RegSaveFrameIndex is X86-64 only.
         // NOTE(review): 0xAAAAAAA looks like a deliberately bogus placeholder
         // index - confirm against the users of these fields.
2863     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2864     if (CallConv == CallingConv::X86_FastCall ||
2865         CallConv == CallingConv::X86_ThisCall)
2866       // fastcc functions can't have varargs.
2867       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2868   }
2869
2870   FuncInfo->setArgumentStackSize(StackSize);
2871
       // For functions with WinEH info and the CoreCLR personality, reserve an
       // 8-byte stack object and record its frame index as the PSPSym slot.
2872   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
2873     EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
2874     if (Personality == EHPersonality::CoreCLR) {
2875       assert(Is64Bit);
2876       // TODO: Add a mechanism to frame lowering that will allow us to indicate
2877       // that we'd prefer this slot be allocated towards the bottom of the frame
2878       // (i.e. near the stack pointer after allocating the frame).  Every
2879       // funclet needs a copy of this slot in its (mostly empty) frame, and the
2880       // offset from the bottom of this and each funclet's frame must be the
2881       // same, so the size of funclets' (mostly empty) frames is dictated by
2882       // how far this slot is from the bottom (since they allocate just enough
2883       // space to accommodate holding this slot at the correct offset).
2884       int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
2885       EHInfo->PSPSymFrameIdx = PSPSymFI;
2886     }
2887   }
2888
2889   return Chain;
2890 }
2891
2892 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
2893                                             SDValue Arg, const SDLoc &dl,
2894                                             SelectionDAG &DAG,
2895                                             const CCValAssign &VA,
2896                                             ISD::ArgFlagsTy Flags) const {
2897   unsigned LocMemOffset = VA.getLocMemOffset();
2898   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2899   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2900                        StackPtr, PtrOff);
2901   if (Flags.isByVal())
2902     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2903
2904   return DAG.getStore(
2905       Chain, dl, Arg, PtrOff,
2906       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
2907 }
2908
2909 /// Loads the return address through the frame index that
2910 /// getReturnAddressFrameIndex yields, stores the loaded value in \p OutRetAddr
2911 /// and returns the load's chain. IsTailCall, Is64Bit and FPDiff are not read.
2912 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
2913     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
2914     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
2915   // Frame index node describing where the return address is read from.
2916   SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
2917   // Read the "old" return address; result #1 of the load is its chain.
2918   OutRetAddr = DAG.getLoad(getPointerTy(DAG.getDataLayout()), dl, Chain,
2919                            RetAddrSlot, MachinePointerInfo());
2920   return OutRetAddr.getValue(1);
2921 }
2922
2923 /// Emit a store of the return address if tail call
2924 /// optimization is performed and it is required (FPDiff!=0).
2925 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2926                                         SDValue Chain, SDValue RetAddrFrIdx,
2927                                         EVT PtrVT, unsigned SlotSize,
2928                                         int FPDiff, const SDLoc &dl) {
2929   // Store the return address to the appropriate stack slot.
2930   if (!FPDiff) return Chain;
2931   // Calculate the new stack slot for the return address.
2932   int NewReturnAddrFI =
2933     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2934                                          false);
2935   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2936   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2937                        MachinePointerInfo::getFixedStack(
2938                            DAG.getMachineFunction(), NewReturnAddrFI));
2939   return Chain;
2940 }
2941
2942 /// Builds the vector shuffle node for a movs{s|d}/movd-style operation on
2943 /// \p VT, using the mask <N, 1, 2, ..., N-1> where N is the element count.
2944 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
2945                        SDValue V2) {
2946   const unsigned Width = VT.getVectorNumElements();
2947   // Slot 0 gets index Width; the remaining slots are the identity 1..Width-1.
2948   SmallVector<int, 8> ShuffleMask(1, Width);
2949   for (unsigned Idx = 1; Idx != Width; ++Idx)
2950     ShuffleMask.push_back(Idx);
2951   return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
2952 }
2953
2954 SDValue
2955 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2956                              SmallVectorImpl<SDValue> &InVals) const {
2957   SelectionDAG &DAG                     = CLI.DAG;
2958   SDLoc &dl                             = CLI.DL;
2959   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2960   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2961   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2962   SDValue Chain                         = CLI.Chain;
2963   SDValue Callee                        = CLI.Callee;
2964   CallingConv::ID CallConv              = CLI.CallConv;
2965   bool &isTailCall                      = CLI.IsTailCall;
2966   bool isVarArg                         = CLI.IsVarArg;
2967
2968   MachineFunction &MF = DAG.getMachineFunction();
2969   bool Is64Bit        = Subtarget.is64Bit();
2970   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
2971   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
2972   bool IsSibcall      = false;
2973   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2974   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
2975
2976   if (CallConv == CallingConv::X86_INTR)
2977     report_fatal_error("X86 interrupts may not be called directly");
2978
2979   if (Attr.getValueAsString() == "true")
2980     isTailCall = false;
2981
2982   if (Subtarget.isPICStyleGOT() &&
2983       !MF.getTarget().Options.GuaranteedTailCallOpt) {
2984     // If we are using a GOT, disable tail calls to external symbols with
2985     // default visibility. Tail calling such a symbol requires using a GOT
2986     // relocation, which forces early binding of the symbol. This breaks code
2987     // that require lazy function symbol resolution. Using musttail or
2988     // GuaranteedTailCallOpt will override this.
2989     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2990     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2991                G->getGlobal()->hasDefaultVisibility()))
2992       isTailCall = false;
2993   }
2994
2995   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2996   if (IsMustTail) {
2997     // Force this to be a tail call.  The verifier rules are enough to ensure
2998     // that we can lower this successfully without moving the return address
2999     // around.
3000     isTailCall = true;
3001   } else if (isTailCall) {
3002     // Check if it's really possible to do a tail call.
3003     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3004                     isVarArg, SR != NotStructReturn,
3005                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3006                     Outs, OutVals, Ins, DAG);
3007
3008     // Sibcalls are automatically detected tailcalls which do not require
3009     // ABI changes.
3010     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3011       IsSibcall = true;
3012
3013     if (isTailCall)
3014       ++NumTailCalls;
3015   }
3016
3017   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3018          "Var args not supported with calling convention fastcc, ghc or hipe");
3019
3020   // Analyze operands of the call, assigning locations to each operand.
3021   SmallVector<CCValAssign, 16> ArgLocs;
3022   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3023
3024   // Allocate shadow area for Win64
3025   if (IsWin64)
3026     CCInfo.AllocateStack(32, 8);
3027
3028   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3029
3030   // Get a count of how many bytes are to be pushed on the stack.
3031   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3032   if (IsSibcall)
3033     // This is a sibcall. The memory operands are available in caller's
3034     // own caller's stack.
3035     NumBytes = 0;
3036   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3037            canGuaranteeTCO(CallConv))
3038     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3039
3040   int FPDiff = 0;
3041   if (isTailCall && !IsSibcall && !IsMustTail) {
3042     // Lower arguments at fp - stackoffset + fpdiff.
3043     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3044
3045     FPDiff = NumBytesCallerPushed - NumBytes;
3046
3047     // Set the delta of movement of the returnaddr stackslot.
3048     // But only set if delta is greater than previous delta.
3049     if (FPDiff < X86Info->getTCReturnAddrDelta())
3050       X86Info->setTCReturnAddrDelta(FPDiff);
3051   }
3052
3053   unsigned NumBytesToPush = NumBytes;
3054   unsigned NumBytesToPop = NumBytes;
3055
3056   // If we have an inalloca argument, all stack space has already been allocated
3057   // for us and be right at the top of the stack.  We don't support multiple
3058   // arguments passed in memory when using inalloca.
3059   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3060     NumBytesToPush = 0;
3061     if (!ArgLocs.back().isMemLoc())
3062       report_fatal_error("cannot use inalloca attribute on a register "
3063                          "parameter");
3064     if (ArgLocs.back().getLocMemOffset() != 0)
3065       report_fatal_error("any parameter with the inalloca attribute must be "
3066                          "the only memory argument");
3067   }
3068
3069   if (!IsSibcall)
3070     Chain = DAG.getCALLSEQ_START(
3071         Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3072
3073   SDValue RetAddrFrIdx;
3074   // Load return address for tail calls.
3075   if (isTailCall && FPDiff)
3076     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3077                                     Is64Bit, FPDiff, dl);
3078
3079   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3080   SmallVector<SDValue, 8> MemOpChains;
3081   SDValue StackPtr;
3082
3083   // Walk the register/memloc assignments, inserting copies/loads.  In the case
3084   // of tail call optimization arguments are handle later.
3085   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3086   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3087     // Skip inalloca arguments, they have already been written.
3088     ISD::ArgFlagsTy Flags = Outs[i].Flags;
3089     if (Flags.isInAlloca())
3090       continue;
3091
3092     CCValAssign &VA = ArgLocs[i];
3093     EVT RegVT = VA.getLocVT();
3094     SDValue Arg = OutVals[i];
3095     bool isByVal = Flags.isByVal();
3096
3097     // Promote the value if needed.
3098     switch (VA.getLocInfo()) {
3099     default: llvm_unreachable("Unknown loc info!");
3100     case CCValAssign::Full: break;
3101     case CCValAssign::SExt:
3102       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3103       break;
3104     case CCValAssign::ZExt:
3105       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3106       break;
3107     case CCValAssign::AExt:
3108       if (Arg.getValueType().isVector() &&
3109           Arg.getValueType().getVectorElementType() == MVT::i1)
3110         Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3111       else if (RegVT.is128BitVector()) {
3112         // Special case: passing MMX values in XMM registers.
3113         Arg = DAG.getBitcast(MVT::i64, Arg);
3114         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3115         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3116       } else
3117         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3118       break;
3119     case CCValAssign::BCvt:
3120       Arg = DAG.getBitcast(RegVT, Arg);
3121       break;
3122     case CCValAssign::Indirect: {
3123       // Store the argument.
3124       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3125       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3126       Chain = DAG.getStore(
3127           Chain, dl, Arg, SpillSlot,
3128           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3129       Arg = SpillSlot;
3130       break;
3131     }
3132     }
3133
3134     if (VA.isRegLoc()) {
3135       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3136       if (isVarArg && IsWin64) {
3137         // Win64 ABI requires argument XMM reg to be copied to the corresponding
3138         // shadow reg if callee is a varargs function.
3139         unsigned ShadowReg = 0;
3140         switch (VA.getLocReg()) {
3141         case X86::XMM0: ShadowReg = X86::RCX; break;
3142         case X86::XMM1: ShadowReg = X86::RDX; break;
3143         case X86::XMM2: ShadowReg = X86::R8; break;
3144         case X86::XMM3: ShadowReg = X86::R9; break;
3145         }
3146         if (ShadowReg)
3147           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3148       }
3149     } else if (!IsSibcall && (!isTailCall || isByVal)) {
3150       assert(VA.isMemLoc());
3151       if (!StackPtr.getNode())
3152         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3153                                       getPointerTy(DAG.getDataLayout()));
3154       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3155                                              dl, DAG, VA, Flags));
3156     }
3157   }
3158
3159   if (!MemOpChains.empty())
3160     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3161
3162   if (Subtarget.isPICStyleGOT()) {
3163     // ELF / PIC requires GOT in the EBX register before function calls via PLT
3164     // GOT pointer.
3165     if (!isTailCall) {
3166       RegsToPass.push_back(std::make_pair(
3167           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3168                                           getPointerTy(DAG.getDataLayout()))));
3169     } else {
3170       // If we are tail calling and generating PIC/GOT style code load the
3171       // address of the callee into ECX. The value in ecx is used as target of
3172       // the tail jump. This is done to circumvent the ebx/callee-saved problem
3173       // for tail calls on PIC/GOT architectures. Normally we would just put the
3174       // address of GOT into ebx and then call target@PLT. But for tail calls
3175       // ebx would be restored (since ebx is callee saved) before jumping to the
3176       // target@PLT.
3177
3178       // Note: The actual moving to ECX is done further down.
3179       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3180       if (G && !G->getGlobal()->hasLocalLinkage() &&
3181           G->getGlobal()->hasDefaultVisibility())
3182         Callee = LowerGlobalAddress(Callee, DAG);
3183       else if (isa<ExternalSymbolSDNode>(Callee))
3184         Callee = LowerExternalSymbol(Callee, DAG);
3185     }
3186   }
3187
3188   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3189     // From AMD64 ABI document:
3190     // For calls that may call functions that use varargs or stdargs
3191     // (prototype-less calls or calls to functions containing ellipsis (...) in
3192     // the declaration) %al is used as hidden argument to specify the number
3193     // of SSE registers used. The contents of %al do not need to match exactly
3194     // the number of registers, but must be an ubound on the number of SSE
3195     // registers used and is in the range 0 - 8 inclusive.
3196
3197     // Count the number of XMM registers allocated.
3198     static const MCPhysReg XMMArgRegs[] = {
3199       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3200       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3201     };
3202     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3203     assert((Subtarget.hasSSE1() || !NumXMMRegs)
3204            && "SSE registers cannot be used when SSE is disabled");
3205
3206     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3207                                         DAG.getConstant(NumXMMRegs, dl,
3208                                                         MVT::i8)));
3209   }
3210
3211   if (isVarArg && IsMustTail) {
3212     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3213     for (const auto &F : Forwards) {
3214       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3215       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3216     }
3217   }
3218
3219   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3220   // don't need this because the eligibility check rejects calls that require
3221   // shuffling arguments passed in memory.
3222   if (!IsSibcall && isTailCall) {
3223     // Force all the incoming stack arguments to be loaded from the stack
3224     // before any new outgoing arguments are stored to the stack, because the
3225     // outgoing stack slots may alias the incoming argument stack slots, and
3226     // the alias isn't otherwise explicit. This is slightly more conservative
3227     // than necessary, because it means that each store effectively depends
3228     // on every argument instead of just those arguments it would clobber.
3229     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3230
3231     SmallVector<SDValue, 8> MemOpChains2;
3232     SDValue FIN;
3233     int FI = 0;
3234     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3235       CCValAssign &VA = ArgLocs[i];
3236       if (VA.isRegLoc())
3237         continue;
3238       assert(VA.isMemLoc());
3239       SDValue Arg = OutVals[i];
3240       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3241       // Skip inalloca arguments.  They don't require any work.
3242       if (Flags.isInAlloca())
3243         continue;
3244       // Create frame index.
3245       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3246       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3247       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3248       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3249
3250       if (Flags.isByVal()) {
3251         // Copy relative to framepointer.
3252         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3253         if (!StackPtr.getNode())
3254           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3255                                         getPointerTy(DAG.getDataLayout()));
3256         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3257                              StackPtr, Source);
3258
3259         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3260                                                          ArgChain,
3261                                                          Flags, DAG, dl));
3262       } else {
3263         // Store relative to framepointer.
3264         MemOpChains2.push_back(DAG.getStore(
3265             ArgChain, dl, Arg, FIN,
3266             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3267       }
3268     }
3269
3270     if (!MemOpChains2.empty())
3271       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3272
3273     // Store the return address to the appropriate stack slot.
3274     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3275                                      getPointerTy(DAG.getDataLayout()),
3276                                      RegInfo->getSlotSize(), FPDiff, dl);
3277   }
3278
3279   // Build a sequence of copy-to-reg nodes chained together with token chain
3280   // and flag operands which copy the outgoing args into registers.
3281   SDValue InFlag;
3282   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3283     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3284                              RegsToPass[i].second, InFlag);
3285     InFlag = Chain.getValue(1);
3286   }
3287
3288   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3289     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3290     // In the 64-bit large code model, we have to make all calls
3291     // through a register, since the call instruction's 32-bit
3292     // pc-relative offset may not be large enough to hold the whole
3293     // address.
3294   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3295     // If the callee is a GlobalAddress node (quite common, every direct call
3296     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3297     // it.
3298     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3299
3300     // We should use extra load for direct calls to dllimported functions in
3301     // non-JIT mode.
3302     const GlobalValue *GV = G->getGlobal();
3303     if (!GV->hasDLLImportStorageClass()) {
3304       unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3305
3306       Callee = DAG.getTargetGlobalAddress(
3307           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3308
3309       if (OpFlags == X86II::MO_GOTPCREL) {
3310         // Add a wrapper.
3311         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3312           getPointerTy(DAG.getDataLayout()), Callee);
3313         // Add extra indirection
3314         Callee = DAG.getLoad(
3315             getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3316             MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3317       }
3318     }
3319   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3320     const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3321     unsigned char OpFlags =
3322         Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3323
3324     Callee = DAG.getTargetExternalSymbol(
3325         S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3326   } else if (Subtarget.isTarget64BitILP32() &&
3327              Callee->getValueType(0) == MVT::i32) {
3328     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3329     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3330   }
3331
3332   // Returns a chain & a flag for retval copy to use.
3333   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3334   SmallVector<SDValue, 8> Ops;
3335
3336   if (!IsSibcall && isTailCall) {
3337     Chain = DAG.getCALLSEQ_END(Chain,
3338                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3339                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3340     InFlag = Chain.getValue(1);
3341   }
3342
3343   Ops.push_back(Chain);
3344   Ops.push_back(Callee);
3345
3346   if (isTailCall)
3347     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3348
3349   // Add argument registers to the end of the list so that they are known live
3350   // into the call.
3351   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3352     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3353                                   RegsToPass[i].second.getValueType()));
3354
3355   // Add a register mask operand representing the call-preserved registers.
3356   const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3357   assert(Mask && "Missing call preserved mask for calling convention");
3358
3359   // If this is an invoke in a 32-bit function using a funclet-based
3360   // personality, assume the function clobbers all registers. If an exception
3361   // is thrown, the runtime will not restore CSRs.
3362   // FIXME: Model this more precisely so that we can register allocate across
3363   // the normal edge and spill and fill across the exceptional edge.
3364   if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3365     const Function *CallerFn = MF.getFunction();
3366     EHPersonality Pers =
3367         CallerFn->hasPersonalityFn()
3368             ? classifyEHPersonality(CallerFn->getPersonalityFn())
3369             : EHPersonality::Unknown;
3370     if (isFuncletEHPersonality(Pers))
3371       Mask = RegInfo->getNoPreservedMask();
3372   }
3373
3374   Ops.push_back(DAG.getRegisterMask(Mask));
3375
3376   if (InFlag.getNode())
3377     Ops.push_back(InFlag);
3378
3379   if (isTailCall) {
3380     // We used to do:
3381     //// If this is the first return lowered for this function, add the regs
3382     //// to the liveout set for the function.
3383     // This isn't right, although it's probably harmless on x86; liveouts
3384     // should be computed from returns not tail calls.  Consider a void
3385     // function making a tail call to a function returning int.
3386     MF.getFrameInfo()->setHasTailCall();
3387     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3388   }
3389
3390   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3391   InFlag = Chain.getValue(1);
3392
3393   // Create the CALLSEQ_END node.
3394   unsigned NumBytesForCalleeToPop;
3395   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3396                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3397     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3398   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3399            !Subtarget.getTargetTriple().isOSMSVCRT() &&
3400            SR == StackStructReturn)
3401     // If this is a call to a struct-return function, the callee
3402     // pops the hidden struct pointer, so we have to push it back.
3403     // This is common for Darwin/X86, Linux & Mingw32 targets.
3404     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3405     NumBytesForCalleeToPop = 4;
3406   else
3407     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3408
3409   if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3410     // No need to reset the stack after the call if the call doesn't return. To
3411     // make the MI verify, we'll pretend the callee does it for us.
3412     NumBytesForCalleeToPop = NumBytes;
3413   }
3414
3415   // Returns a flag for retval copy to use.
3416   if (!IsSibcall) {
3417     Chain = DAG.getCALLSEQ_END(Chain,
3418                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3419                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3420                                                      true),
3421                                InFlag, dl);
3422     InFlag = Chain.getValue(1);
3423   }
3424
3425   // Handle result values, copying them out of physregs into vregs that we
3426   // return.
3427   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3428                          Ins, dl, DAG, InVals);
3429 }
3430
3431 //===----------------------------------------------------------------------===//
3432 //                Fast Calling Convention (tail call) implementation
3433 //===----------------------------------------------------------------------===//
3434
//  Like the stdcall convention, the callee cleans up the arguments, except
//  that ECX is reserved for storing the tail-called function address. Only 2
//  registers are free for argument passing (inreg). Tail call optimization is
//  performed provided:
3439 //                * tailcallopt is enabled
3440 //                * caller/callee are fastcc
3441 //  On X86_64 architecture with GOT-style position independent code only local
3442 //  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld,
//  for example.)
3446 //  If a tail called function callee has more arguments than the caller the
3447 //  caller needs to make sure that there is room to move the RETADDR to. This is
3448 //  achieved by reserving an area the size of the argument delta right after the
3449 //  original RETADDR, but before the saved framepointer or the spilled registers
3450 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3451 //  stack layout:
3452 //    arg1
3453 //    arg2
3454 //    RETADDR
3455 //    [ new RETADDR
3456 //      move area ]
3457 //    (possible EBP)
3458 //    ESI
3459 //    EDI
3460 //    local1 ..
3461
3462 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
3463 /// requirement.
3464 unsigned
3465 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3466                                                SelectionDAG& DAG) const {
3467   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3468   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3469   unsigned StackAlignment = TFI.getStackAlignment();
3470   uint64_t AlignMask = StackAlignment - 1;
3471   int64_t Offset = StackSize;
3472   unsigned SlotSize = RegInfo->getSlotSize();
3473   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3474     // Number smaller than 12 so just add the difference.
3475     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3476   } else {
3477     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3478     Offset = ((~AlignMask) & Offset) + StackAlignment +
3479       (StackAlignment-SlotSize);
3480   }
3481   return Offset;
3482 }
3483
3484 /// Return true if the given stack call argument is already available in the
3485 /// same position (relatively) of the caller's incoming argument stack.
3486 static
3487 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3488                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3489                          const X86InstrInfo *TII, const CCValAssign &VA) {
3490   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3491
3492   for (;;) {
3493     // Look through nodes that don't alter the bits of the incoming value.
3494     unsigned Op = Arg.getOpcode();
3495     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3496       Arg = Arg.getOperand(0);
3497       continue;
3498     }
3499     if (Op == ISD::TRUNCATE) {
3500       const SDValue &TruncInput = Arg.getOperand(0);
3501       if (TruncInput.getOpcode() == ISD::AssertZext &&
3502           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3503               Arg.getValueType()) {
3504         Arg = TruncInput.getOperand(0);
3505         continue;
3506       }
3507     }
3508     break;
3509   }
3510
3511   int FI = INT_MAX;
3512   if (Arg.getOpcode() == ISD::CopyFromReg) {
3513     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3514     if (!TargetRegisterInfo::isVirtualRegister(VR))
3515       return false;
3516     MachineInstr *Def = MRI->getVRegDef(VR);
3517     if (!Def)
3518       return false;
3519     if (!Flags.isByVal()) {
3520       if (!TII->isLoadFromStackSlot(*Def, FI))
3521         return false;
3522     } else {
3523       unsigned Opcode = Def->getOpcode();
3524       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3525            Opcode == X86::LEA64_32r) &&
3526           Def->getOperand(1).isFI()) {
3527         FI = Def->getOperand(1).getIndex();
3528         Bytes = Flags.getByValSize();
3529       } else
3530         return false;
3531     }
3532   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3533     if (Flags.isByVal())
3534       // ByVal argument is passed in as a pointer but it's now being
3535       // dereferenced. e.g.
3536       // define @foo(%struct.X* %A) {
3537       //   tail call @bar(%struct.X* byval %A)
3538       // }
3539       return false;
3540     SDValue Ptr = Ld->getBasePtr();
3541     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3542     if (!FINode)
3543       return false;
3544     FI = FINode->getIndex();
3545   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3546     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3547     FI = FINode->getIndex();
3548     Bytes = Flags.getByValSize();
3549   } else
3550     return false;
3551
3552   assert(FI != INT_MAX);
3553   if (!MFI->isFixedObjectIndex(FI))
3554     return false;
3555
3556   if (Offset != MFI->getObjectOffset(FI))
3557     return false;
3558
3559   if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
3560     // If the argument location is wider than the argument type, check that any
3561     // extension flags match.
3562     if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
3563         Flags.isSExt() != MFI->isObjectSExt(FI)) {
3564       return false;
3565     }
3566   }
3567
3568   return Bytes == MFI->getObjectSize(FI);
3569 }
3570
3571 /// Check whether the call is eligible for tail call optimization. Targets
3572 /// that want to do tail call optimization should implement this function.
3573 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3574     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3575     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3576     const SmallVectorImpl<ISD::OutputArg> &Outs,
3577     const SmallVectorImpl<SDValue> &OutVals,
3578     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3579   if (!mayTailCallThisCC(CalleeCC))
3580     return false;
3581
3582   // If -tailcallopt is specified, make fastcc functions tail-callable.
3583   MachineFunction &MF = DAG.getMachineFunction();
3584   const Function *CallerF = MF.getFunction();
3585
3586   // If the function return type is x86_fp80 and the callee return type is not,
3587   // then the FP_EXTEND of the call result is not a nop. It's not safe to
3588   // perform a tailcall optimization here.
3589   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3590     return false;
3591
3592   CallingConv::ID CallerCC = CallerF->getCallingConv();
3593   bool CCMatch = CallerCC == CalleeCC;
3594   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3595   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3596
3597   // Win64 functions have extra shadow space for argument homing. Don't do the
3598   // sibcall if the caller and callee have mismatched expectations for this
3599   // space.
3600   if (IsCalleeWin64 != IsCallerWin64)
3601     return false;
3602
3603   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3604     if (canGuaranteeTCO(CalleeCC) && CCMatch)
3605       return true;
3606     return false;
3607   }
3608
3609   // Look for obvious safe cases to perform tail call optimization that do not
3610   // require ABI changes. This is what gcc calls sibcall.
3611
3612   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3613   // emit a special epilogue.
3614   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3615   if (RegInfo->needsStackRealignment(MF))
3616     return false;
3617
3618   // Also avoid sibcall optimization if either caller or callee uses struct
3619   // return semantics.
3620   if (isCalleeStructRet || isCallerStructRet)
3621     return false;
3622
3623   // Do not sibcall optimize vararg calls unless all arguments are passed via
3624   // registers.
3625   LLVMContext &C = *DAG.getContext();
3626   if (isVarArg && !Outs.empty()) {
3627     // Optimizing for varargs on Win64 is unlikely to be safe without
3628     // additional testing.
3629     if (IsCalleeWin64 || IsCallerWin64)
3630       return false;
3631
3632     SmallVector<CCValAssign, 16> ArgLocs;
3633     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3634
3635     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3636     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3637       if (!ArgLocs[i].isRegLoc())
3638         return false;
3639   }
3640
3641   // If the call result is in ST0 / ST1, it needs to be popped off the x87
3642   // stack.  Therefore, if it's not used by the call it is not safe to optimize
3643   // this into a sibcall.
3644   bool Unused = false;
3645   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3646     if (!Ins[i].Used) {
3647       Unused = true;
3648       break;
3649     }
3650   }
3651   if (Unused) {
3652     SmallVector<CCValAssign, 16> RVLocs;
3653     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3654     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3655     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3656       CCValAssign &VA = RVLocs[i];
3657       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3658         return false;
3659     }
3660   }
3661
3662   // Check that the call results are passed in the same way.
3663   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3664                                   RetCC_X86, RetCC_X86))
3665     return false;
3666   // The callee has to preserve all registers the caller needs to preserve.
3667   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3668   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3669   if (!CCMatch) {
3670     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3671     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3672       return false;
3673   }
3674
3675   unsigned StackArgsSize = 0;
3676
3677   // If the callee takes no arguments then go on to check the results of the
3678   // call.
3679   if (!Outs.empty()) {
3680     // Check if stack adjustment is needed. For now, do not do this if any
3681     // argument is passed on the stack.
3682     SmallVector<CCValAssign, 16> ArgLocs;
3683     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3684
3685     // Allocate shadow area for Win64
3686     if (IsCalleeWin64)
3687       CCInfo.AllocateStack(32, 8);
3688
3689     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3690     StackArgsSize = CCInfo.getNextStackOffset();
3691
3692     if (CCInfo.getNextStackOffset()) {
3693       // Check if the arguments are already laid out in the right way as
3694       // the caller's fixed stack objects.
3695       MachineFrameInfo *MFI = MF.getFrameInfo();
3696       const MachineRegisterInfo *MRI = &MF.getRegInfo();
3697       const X86InstrInfo *TII = Subtarget.getInstrInfo();
3698       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3699         CCValAssign &VA = ArgLocs[i];
3700         SDValue Arg = OutVals[i];
3701         ISD::ArgFlagsTy Flags = Outs[i].Flags;
3702         if (VA.getLocInfo() == CCValAssign::Indirect)
3703           return false;
3704         if (!VA.isRegLoc()) {
3705           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3706                                    MFI, MRI, TII, VA))
3707             return false;
3708         }
3709       }
3710     }
3711
3712     bool PositionIndependent = isPositionIndependent();
3713     // If the tailcall address may be in a register, then make sure it's
3714     // possible to register allocate for it. In 32-bit, the call address can
3715     // only target EAX, EDX, or ECX since the tail call must be scheduled after
3716     // callee-saved registers are restored. These happen to be the same
3717     // registers used to pass 'inreg' arguments so watch out for those.
3718     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
3719                                   !isa<ExternalSymbolSDNode>(Callee)) ||
3720                                  PositionIndependent)) {
3721       unsigned NumInRegs = 0;
3722       // In PIC we need an extra register to formulate the address computation
3723       // for the callee.
3724       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
3725
3726       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3727         CCValAssign &VA = ArgLocs[i];
3728         if (!VA.isRegLoc())
3729           continue;
3730         unsigned Reg = VA.getLocReg();
3731         switch (Reg) {
3732         default: break;
3733         case X86::EAX: case X86::EDX: case X86::ECX:
3734           if (++NumInRegs == MaxInRegs)
3735             return false;
3736           break;
3737         }
3738       }
3739     }
3740
3741     const MachineRegisterInfo &MRI = MF.getRegInfo();
3742     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3743       return false;
3744   }
3745
3746   bool CalleeWillPop =
3747       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
3748                        MF.getTarget().Options.GuaranteedTailCallOpt);
3749
3750   if (unsigned BytesToPop =
3751           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
3752     // If we have bytes to pop, the callee must pop them.
3753     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
3754     if (!CalleePopMatches)
3755       return false;
3756   } else if (CalleeWillPop && StackArgsSize > 0) {
3757     // If we don't have bytes to pop, make sure the callee doesn't pop any.
3758     return false;
3759   }
3760
3761   return true;
3762 }
3763
3764 FastISel *
3765 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3766                                   const TargetLibraryInfo *libInfo) const {
3767   return X86::createFastISel(funcInfo, libInfo);
3768 }
3769
3770 //===----------------------------------------------------------------------===//
3771 //                           Other Lowering Hooks
3772 //===----------------------------------------------------------------------===//
3773
3774 static bool MayFoldLoad(SDValue Op) {
3775   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3776 }
3777
3778 static bool MayFoldIntoStore(SDValue Op) {
3779   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3780 }
3781
3782 static bool isTargetShuffle(unsigned Opcode) {
3783   switch(Opcode) {
3784   default: return false;
3785   case X86ISD::BLENDI:
3786   case X86ISD::PSHUFB:
3787   case X86ISD::PSHUFD:
3788   case X86ISD::PSHUFHW:
3789   case X86ISD::PSHUFLW:
3790   case X86ISD::SHUFP:
3791   case X86ISD::INSERTPS:
3792   case X86ISD::PALIGNR:
3793   case X86ISD::VSHLDQ:
3794   case X86ISD::VSRLDQ:
3795   case X86ISD::MOVLHPS:
3796   case X86ISD::MOVLHPD:
3797   case X86ISD::MOVHLPS:
3798   case X86ISD::MOVLPS:
3799   case X86ISD::MOVLPD:
3800   case X86ISD::MOVSHDUP:
3801   case X86ISD::MOVSLDUP:
3802   case X86ISD::MOVDDUP:
3803   case X86ISD::MOVSS:
3804   case X86ISD::MOVSD:
3805   case X86ISD::UNPCKL:
3806   case X86ISD::UNPCKH:
3807   case X86ISD::VBROADCAST:
3808   case X86ISD::VPERMILPI:
3809   case X86ISD::VPERMILPV:
3810   case X86ISD::VPERM2X128:
3811   case X86ISD::VPERMIL2:
3812   case X86ISD::VPERMI:
3813   case X86ISD::VPPERM:
3814   case X86ISD::VPERMV:
3815   case X86ISD::VPERMV3:
3816   case X86ISD::VZEXT_MOVL:
3817     return true;
3818   }
3819 }
3820
3821 static bool isTargetShuffleVariableMask(unsigned Opcode) {
3822   switch (Opcode) {
3823   default: return false;
3824   case X86ISD::PSHUFB:
3825   case X86ISD::VPERMILPV:
3826     return true;
3827   }
3828 }
3829
3830 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3831                                     SDValue V1, unsigned TargetMask,
3832                                     SelectionDAG &DAG) {
3833   switch(Opc) {
3834   default: llvm_unreachable("Unknown x86 shuffle node");
3835   case X86ISD::PSHUFD:
3836   case X86ISD::PSHUFHW:
3837   case X86ISD::PSHUFLW:
3838   case X86ISD::VPERMILPI:
3839   case X86ISD::VPERMI:
3840     return DAG.getNode(Opc, dl, VT, V1,
3841                        DAG.getConstant(TargetMask, dl, MVT::i8));
3842   }
3843 }
3844
3845 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3846                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3847   switch(Opc) {
3848   default: llvm_unreachable("Unknown x86 shuffle node");
3849   case X86ISD::MOVLHPS:
3850   case X86ISD::MOVLHPD:
3851   case X86ISD::MOVHLPS:
3852   case X86ISD::MOVLPS:
3853   case X86ISD::MOVLPD:
3854   case X86ISD::MOVSS:
3855   case X86ISD::MOVSD:
3856   case X86ISD::UNPCKL:
3857   case X86ISD::UNPCKH:
3858     return DAG.getNode(Opc, dl, VT, V1, V2);
3859   }
3860 }
3861
3862 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3863   MachineFunction &MF = DAG.getMachineFunction();
3864   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3865   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3866   int ReturnAddrIndex = FuncInfo->getRAIndex();
3867
3868   if (ReturnAddrIndex == 0) {
3869     // Set up a frame object for the return address.
3870     unsigned SlotSize = RegInfo->getSlotSize();
3871     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3872                                                            -(int64_t)SlotSize,
3873                                                            false);
3874     FuncInfo->setRAIndex(ReturnAddrIndex);
3875   }
3876
3877   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3878 }
3879
3880 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3881                                        bool hasSymbolicDisplacement) {
3882   // Offset should fit into 32 bit immediate field.
3883   if (!isInt<32>(Offset))
3884     return false;
3885
3886   // If we don't have a symbolic displacement - we don't have any extra
3887   // restrictions.
3888   if (!hasSymbolicDisplacement)
3889     return true;
3890
3891   // FIXME: Some tweaks might be needed for medium code model.
3892   if (M != CodeModel::Small && M != CodeModel::Kernel)
3893     return false;
3894
3895   // For small code model we assume that latest object is 16MB before end of 31
3896   // bits boundary. We may also accept pretty large negative constants knowing
3897   // that all objects are in the positive half of address space.
3898   if (M == CodeModel::Small && Offset < 16*1024*1024)
3899     return true;
3900
3901   // For kernel code model we know that all object resist in the negative half
3902   // of 32bits address space. We may not accept negative offsets, since they may
3903   // be just off and we may accept pretty large positive ones.
3904   if (M == CodeModel::Kernel && Offset >= 0)
3905     return true;
3906
3907   return false;
3908 }
3909
3910 /// Determines whether the callee is required to pop its own arguments.
3911 /// Callee pop is necessary to support tail calls.
3912 bool X86::isCalleePop(CallingConv::ID CallingConv,
3913                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
3914   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
3915   // can guarantee TCO.
3916   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
3917     return true;
3918
3919   switch (CallingConv) {
3920   default:
3921     return false;
3922   case CallingConv::X86_StdCall:
3923   case CallingConv::X86_FastCall:
3924   case CallingConv::X86_ThisCall:
3925   case CallingConv::X86_VectorCall:
3926     return !is64Bit;
3927   }
3928 }
3929
3930 /// \brief Return true if the condition is an unsigned comparison operation.
3931 static bool isX86CCUnsigned(unsigned X86CC) {
3932   switch (X86CC) {
3933   default:
3934     llvm_unreachable("Invalid integer condition!");
3935   case X86::COND_E:
3936   case X86::COND_NE:
3937   case X86::COND_B:
3938   case X86::COND_A:
3939   case X86::COND_BE:
3940   case X86::COND_AE:
3941     return true;
3942   case X86::COND_G:
3943   case X86::COND_GE:
3944   case X86::COND_L:
3945   case X86::COND_LE:
3946     return false;
3947   }
3948 }
3949
3950 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
3951   switch (SetCCOpcode) {
3952   default: llvm_unreachable("Invalid integer condition!");
3953   case ISD::SETEQ:  return X86::COND_E;
3954   case ISD::SETGT:  return X86::COND_G;
3955   case ISD::SETGE:  return X86::COND_GE;
3956   case ISD::SETLT:  return X86::COND_L;
3957   case ISD::SETLE:  return X86::COND_LE;
3958   case ISD::SETNE:  return X86::COND_NE;
3959   case ISD::SETULT: return X86::COND_B;
3960   case ISD::SETUGT: return X86::COND_A;
3961   case ISD::SETULE: return X86::COND_BE;
3962   case ISD::SETUGE: return X86::COND_AE;
3963   }
3964 }
3965
3966 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
3967 /// condition code, returning the condition code and the LHS/RHS of the
3968 /// comparison to make.
3969 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
3970                                bool isFP, SDValue &LHS, SDValue &RHS,
3971                                SelectionDAG &DAG) {
3972   if (!isFP) {
3973     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3974       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3975         // X > -1   -> X == 0, jump !sign.
3976         RHS = DAG.getConstant(0, DL, RHS.getValueType());
3977         return X86::COND_NS;
3978       }
3979       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3980         // X < 0   -> X == 0, jump on sign.
3981         return X86::COND_S;
3982       }
3983       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3984         // X < 1   -> X <= 0
3985         RHS = DAG.getConstant(0, DL, RHS.getValueType());
3986         return X86::COND_LE;
3987       }
3988     }
3989
3990     return TranslateIntegerX86CC(SetCCOpcode);
3991   }
3992
3993   // First determine if it is required or is profitable to flip the operands.
3994
3995   // If LHS is a foldable load, but RHS is not, flip the condition.
3996   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3997       !ISD::isNON_EXTLoad(RHS.getNode())) {
3998     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3999     std::swap(LHS, RHS);
4000   }
4001
4002   switch (SetCCOpcode) {
4003   default: break;
4004   case ISD::SETOLT:
4005   case ISD::SETOLE:
4006   case ISD::SETUGT:
4007   case ISD::SETUGE:
4008     std::swap(LHS, RHS);
4009     break;
4010   }
4011
4012   // On a floating point condition, the flags are set as follows:
4013   // ZF  PF  CF   op
4014   //  0 | 0 | 0 | X > Y
4015   //  0 | 0 | 1 | X < Y
4016   //  1 | 0 | 0 | X == Y
4017   //  1 | 1 | 1 | unordered
4018   switch (SetCCOpcode) {
4019   default: llvm_unreachable("Condcode should be pre-legalized away");
4020   case ISD::SETUEQ:
4021   case ISD::SETEQ:   return X86::COND_E;
4022   case ISD::SETOLT:              // flipped
4023   case ISD::SETOGT:
4024   case ISD::SETGT:   return X86::COND_A;
4025   case ISD::SETOLE:              // flipped
4026   case ISD::SETOGE:
4027   case ISD::SETGE:   return X86::COND_AE;
4028   case ISD::SETUGT:              // flipped
4029   case ISD::SETULT:
4030   case ISD::SETLT:   return X86::COND_B;
4031   case ISD::SETUGE:              // flipped
4032   case ISD::SETULE:
4033   case ISD::SETLE:   return X86::COND_BE;
4034   case ISD::SETONE:
4035   case ISD::SETNE:   return X86::COND_NE;
4036   case ISD::SETUO:   return X86::COND_P;
4037   case ISD::SETO:    return X86::COND_NP;
4038   case ISD::SETOEQ:
4039   case ISD::SETUNE:  return X86::COND_INVALID;
4040   }
4041 }
4042
4043 /// Is there a floating point cmov for the specific X86 condition code?
4044 /// Current x86 isa includes the following FP cmov instructions:
4045 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4046 static bool hasFPCMov(unsigned X86CC) {
4047   switch (X86CC) {
4048   default:
4049     return false;
4050   case X86::COND_B:
4051   case X86::COND_BE:
4052   case X86::COND_E:
4053   case X86::COND_P:
4054   case X86::COND_A:
4055   case X86::COND_AE:
4056   case X86::COND_NE:
4057   case X86::COND_NP:
4058     return true;
4059   }
4060 }
4061
4062
4063 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4064                                            const CallInst &I,
4065                                            unsigned Intrinsic) const {
4066
4067   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4068   if (!IntrData)
4069     return false;
4070
4071   Info.opc = ISD::INTRINSIC_W_CHAIN;
4072   Info.readMem = false;
4073   Info.writeMem = false;
4074   Info.vol = false;
4075   Info.offset = 0;
4076
4077   switch (IntrData->Type) {
4078   case EXPAND_FROM_MEM: {
4079     Info.ptrVal = I.getArgOperand(0);
4080     Info.memVT = MVT::getVT(I.getType());
4081     Info.align = 1;
4082     Info.readMem = true;
4083     break;
4084   }
4085   case COMPRESS_TO_MEM: {
4086     Info.ptrVal = I.getArgOperand(0);
4087     Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4088     Info.align = 1;
4089     Info.writeMem = true;
4090     break;
4091   }
4092   case TRUNCATE_TO_MEM_VI8:
4093   case TRUNCATE_TO_MEM_VI16:
4094   case TRUNCATE_TO_MEM_VI32: {
4095     Info.ptrVal = I.getArgOperand(0);
4096     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
4097     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4098     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4099       ScalarVT = MVT::i8;
4100     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4101       ScalarVT = MVT::i16;
4102     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4103       ScalarVT = MVT::i32;
4104
4105     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4106     Info.align = 1;
4107     Info.writeMem = true;
4108     break;
4109   }
4110   default:
4111     return false;
4112   }
4113
4114   return true;
4115 }
4116
4117 /// Returns true if the target can instruction select the
4118 /// specified FP immediate natively. If false, the legalizer will
4119 /// materialize the FP immediate as a load from a constant pool.
4120 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4121   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4122     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4123       return true;
4124   }
4125   return false;
4126 }
4127
4128 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4129                                               ISD::LoadExtType ExtTy,
4130                                               EVT NewVT) const {
4131   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4132   // relocation target a movq or addq instruction: don't let the load shrink.
4133   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4134   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4135     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4136       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4137   return true;
4138 }
4139
4140 /// \brief Returns true if it is beneficial to convert a load of a constant
4141 /// to just the constant itself.
4142 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4143                                                           Type *Ty) const {
4144   assert(Ty->isIntegerTy());
4145
4146   unsigned BitSize = Ty->getPrimitiveSizeInBits();
4147   if (BitSize == 0 || BitSize > 64)
4148     return false;
4149   return true;
4150 }
4151
4152 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4153                                                 unsigned Index) const {
4154   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4155     return false;
4156
4157   return (Index == 0 || Index == ResVT.getVectorNumElements());
4158 }
4159
/// Returns true if cttz is cheap to speculate; the answer is exactly
/// Subtarget.hasBMI().
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}
4164
/// Returns true if ctlz is cheap to speculate; the answer is exactly
/// Subtarget.hasLZCNT().
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}
4169
4170 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4171   if (!Subtarget.hasBMI())
4172     return false;
4173
4174   // There are only 32-bit and 64-bit forms for 'andn'.
4175   EVT VT = Y.getValueType();
4176   if (VT != MVT::i32 && VT != MVT::i64)
4177     return false;
4178
4179   return true;
4180 }
4181
4182 /// Return true if every element in Mask, beginning
4183 /// from position Pos and ending in Pos+Size is undef.
4184 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4185   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4186     if (0 <= Mask[i])
4187       return false;
4188   return true;
4189 }
4190
/// Return true if Val is undef (negative) or if its value falls within the
/// half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
4196
4197 /// Return true if every element in Mask is undef or if its value
4198 /// falls within the specified range (L, H].
4199 static bool isUndefOrInRange(ArrayRef<int> Mask,
4200                              int Low, int Hi) {
4201   for (int M : Mask)
4202     if (!isUndefOrInRange(M, Low, Hi))
4203       return false;
4204   return true;
4205 }
4206
/// Return true if Val is less than zero (undef) or equal to CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
4211
4212 /// Val is either the undef or zero sentinel value.
4213 static bool isUndefOrZero(int Val) {
4214   return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
4215 }
4216
4217 /// Return true if every element in Mask, beginning
4218 /// from position Pos and ending in Pos+Size, falls within the specified
4219 /// sequential range (Low, Low+Size]. or is undef.
4220 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4221                                        unsigned Pos, unsigned Size, int Low) {
4222   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4223     if (!isUndefOrEqual(Mask[i], Low))
4224       return false;
4225   return true;
4226 }
4227
4228 /// Return true if every element in Mask, beginning
4229 /// from position Pos and ending in Pos+Size, falls within the specified
4230 /// sequential range (Low, Low+Size], or is undef or is zero.
4231 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4232                                              unsigned Size, int Low) {
4233   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4234     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4235       return false;
4236   return true;
4237 }
4238
4239 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4240 /// extract that is suitable for instruction that extract 128 or 256 bit vectors
4241 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4242   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4243   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4244     return false;
4245
4246   // The index should be aligned on a vecWidth-bit boundary.
4247   uint64_t Index =
4248     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4249
4250   MVT VT = N->getSimpleValueType(0);
4251   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4252   bool Result = (Index * ElSize) % vecWidth == 0;
4253
4254   return Result;
4255 }
4256
4257 /// Return true if the specified INSERT_SUBVECTOR
4258 /// operand specifies a subvector insert that is suitable for input to
4259 /// insertion of 128 or 256-bit subvectors
4260 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4261   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4262   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4263     return false;
4264   // The index should be aligned on a vecWidth-bit boundary.
4265   uint64_t Index =
4266     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4267
4268   MVT VT = N->getSimpleValueType(0);
4269   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4270   bool Result = (Index * ElSize) % vecWidth == 0;
4271
4272   return Result;
4273 }
4274
/// Return true if the INSERT_SUBVECTOR node N has an index suitable for a
/// 128-bit subvector insert. Forwards to isVINSERTIndex with a width of 128.
bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}
4278
/// Return true if the INSERT_SUBVECTOR node N has an index suitable for a
/// 256-bit subvector insert. Forwards to isVINSERTIndex with a width of 256.
bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}
4282
/// Return true if the EXTRACT_SUBVECTOR node N has an index suitable for a
/// 128-bit subvector extract. Forwards to isVEXTRACTIndex with a width of 128.
bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}
4286
/// Return true if the EXTRACT_SUBVECTOR node N has an index suitable for a
/// 256-bit subvector extract. Forwards to isVEXTRACTIndex with a width of 256.
bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}
4290
4291 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4292   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4293   assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4294          "Illegal extract subvector for VEXTRACT");
4295
4296   uint64_t Index =
4297     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4298
4299   MVT VecVT = N->getOperand(0).getSimpleValueType();
4300   MVT ElVT = VecVT.getVectorElementType();
4301
4302   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4303   return Index / NumElemsPerChunk;
4304 }
4305
4306 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4307   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4308   assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4309          "Illegal insert subvector for VINSERT");
4310
4311   uint64_t Index =
4312     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4313
4314   MVT VecVT = N->getSimpleValueType(0);
4315   MVT ElVT = VecVT.getVectorElementType();
4316
4317   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4318   return Index / NumElemsPerChunk;
4319 }
4320
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
/// Forwards to getExtractVEXTRACTImmediate with a width of 128.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}
4326
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
/// Forwards to getExtractVEXTRACTImmediate with a width of 256.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}
4332
/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
/// Forwards to getInsertVINSERTImmediate with a width of 128.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}
4338
/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
/// Forwards to getInsertVINSERTImmediate with a width of 256.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}
4344
4345 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4346 bool X86::isZeroNode(SDValue Elt) {
4347   return isNullConstant(Elt) || isNullFPConstant(Elt);
4348 }
4349
4350 // Build a vector of constants
4351 // Use an UNDEF node if MaskElt == -1.
4352 // Spilt 64-bit constants in the 32-bit mode.
4353 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4354                               const SDLoc &dl, bool IsMask = false) {
4355
4356   SmallVector<SDValue, 32>  Ops;
4357   bool Split = false;
4358
4359   MVT ConstVecVT = VT;
4360   unsigned NumElts = VT.getVectorNumElements();
4361   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4362   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4363     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4364     Split = true;
4365   }
4366
4367   MVT EltVT = ConstVecVT.getVectorElementType();
4368   for (unsigned i = 0; i < NumElts; ++i) {
4369     bool IsUndef = Values[i] < 0 && IsMask;
4370     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4371       DAG.getConstant(Values[i], dl, EltVT);
4372     Ops.push_back(OpNode);
4373     if (Split)
4374       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4375                     DAG.getConstant(0, dl, EltVT));
4376   }
4377   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4378   if (Split)
4379     ConstsNode = DAG.getBitcast(VT, ConstsNode);
4380   return ConstsNode;
4381 }
4382
4383 /// Returns a vector of specified type with all zero elements.
4384 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4385                              SelectionDAG &DAG, const SDLoc &dl) {
4386   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4387           VT.getVectorElementType() == MVT::i1) &&
4388          "Unexpected vector type");
4389
4390   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4391   // type. This ensures they get CSE'd. But if the integer type is not
4392   // available, use a floating-point +0.0 instead.
4393   SDValue Vec;
4394   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4395     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4396   } else if (VT.getVectorElementType() == MVT::i1) {
4397     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4398            "Unexpected vector type");
4399     assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4400            "Unexpected vector type");
4401     Vec = DAG.getConstant(0, dl, VT);
4402   } else {
4403     unsigned Num32BitElts = VT.getSizeInBits() / 32;
4404     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4405   }
4406   return DAG.getBitcast(VT, Vec);
4407 }
4408
4409 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4410                                 const SDLoc &dl, unsigned vectorWidth) {
4411   assert((vectorWidth == 128 || vectorWidth == 256) &&
4412          "Unsupported vector width");
4413   EVT VT = Vec.getValueType();
4414   EVT ElVT = VT.getVectorElementType();
4415   unsigned Factor = VT.getSizeInBits()/vectorWidth;
4416   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4417                                   VT.getVectorNumElements()/Factor);
4418
4419   // Extract from UNDEF is UNDEF.
4420   if (Vec.isUndef())
4421     return DAG.getUNDEF(ResultVT);
4422
4423   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
4424   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4425   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4426
4427   // This is the index of the first element of the vectorWidth-bit chunk
4428   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4429   IdxVal &= ~(ElemsPerChunk - 1);
4430
4431   // If the input is a buildvector just emit a smaller one.
4432   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4433     return DAG.getNode(ISD::BUILD_VECTOR,
4434          dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4435
4436   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4437   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4438 }
4439
/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. IdxVal is an element index
/// inside the 128 bits we want.  It need not be aligned to a 128-bit boundary.
/// That makes lowering EXTRACT_VECTOR_ELT operations easier.
/// Vec must be a 256-bit or 512-bit vector.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
4452
/// Generate a DAG to grab 256-bits from a 512-bit vector. IdxVal is an
/// element index inside the 256 bits we want; it is forwarded to
/// extractSubVector with a width of 256.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
4459
4460 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4461                                SelectionDAG &DAG, const SDLoc &dl,
4462                                unsigned vectorWidth) {
4463   assert((vectorWidth == 128 || vectorWidth == 256) &&
4464          "Unsupported vector width");
4465   // Inserting UNDEF is Result
4466   if (Vec.isUndef())
4467     return Result;
4468   EVT VT = Vec.getValueType();
4469   EVT ElVT = VT.getVectorElementType();
4470   EVT ResultVT = Result.getValueType();
4471
4472   // Insert the relevant vectorWidth bits.
4473   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4474   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4475
4476   // This is the index of the first element of the vectorWidth-bit chunk
4477   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4478   IdxVal &= ~(ElemsPerChunk - 1);
4479
4480   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4481   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4482 }
4483
4484 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
4485 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4486 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4487 /// simple superregister reference.  Idx is an index in the 128 bits
4488 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
4489 /// lowering INSERT_VECTOR_ELT operations easier.
4490 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4491                                   SelectionDAG &DAG, const SDLoc &dl) {
4492   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4493
4494   // For insertion into the zero index (low half) of a 256-bit vector, it is
4495   // more efficient to generate a blend with immediate instead of an insert*128.
4496   // We are still creating an INSERT_SUBVECTOR below with an undef node to
4497   // extend the subvector to the size of the result vector. Make sure that
4498   // we are not recursing on that node by checking for undef here.
4499   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4500       !Result.isUndef()) {
4501     EVT ResultVT = Result.getValueType();
4502     SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4503     SDValue Undef = DAG.getUNDEF(ResultVT);
4504     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4505                                  Vec, ZeroIndex);
4506
4507     // The blend instruction, and therefore its mask, depend on the data type.
4508     MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4509     if (ScalarType.isFloatingPoint()) {
4510       // Choose either vblendps (float) or vblendpd (double).
4511       unsigned ScalarSize = ScalarType.getSizeInBits();
4512       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4513       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4514       SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4515       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4516     }
4517
4518     const X86Subtarget &Subtarget =
4519     static_cast<const X86Subtarget &>(DAG.getSubtarget());
4520
4521     // AVX2 is needed for 256-bit integer blend support.
4522     // Integers must be cast to 32-bit because there is only vpblendd;
4523     // vpblendw can't be used for this because it has a handicapped mask.
4524
4525     // If we don't have AVX2, then cast to float. Using a wrong domain blend
4526     // is still more efficient than using the wrong domain vinsertf128 that
4527     // will be created by InsertSubVector().
4528     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4529
4530     SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4531     Result = DAG.getBitcast(CastVT, Result);
4532     Vec256 = DAG.getBitcast(CastVT, Vec256);
4533     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4534     return DAG.getBitcast(ResultVT, Vec256);
4535   }
4536
4537   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4538 }
4539
/// Generate a DAG to put the 256-bit vector Vec into Result. IdxVal is an
/// element index inside the destination 256 bits; it is forwarded to
/// insertSubVector with a width of 256.
static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
4545
4546 /// Insert i1-subvector to i1-vector.
4547 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4548                                 const X86Subtarget &Subtarget) {
4549
4550   SDLoc dl(Op);
4551   SDValue Vec = Op.getOperand(0);
4552   SDValue SubVec = Op.getOperand(1);
4553   SDValue Idx = Op.getOperand(2);
4554
4555   if (!isa<ConstantSDNode>(Idx))
4556     return SDValue();
4557
4558   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4559   if (IdxVal == 0  && Vec.isUndef()) // the operation is legal
4560     return Op;
4561
4562   MVT OpVT = Op.getSimpleValueType();
4563   MVT SubVecVT = SubVec.getSimpleValueType();
4564   unsigned NumElems = OpVT.getVectorNumElements();
4565   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4566
4567   assert(IdxVal + SubVecNumElems <= NumElems &&
4568          IdxVal % SubVecVT.getSizeInBits() == 0 &&
4569          "Unexpected index value in INSERT_SUBVECTOR");
4570
4571   // There are 3 possible cases:
4572   // 1. Subvector should be inserted in the lower part (IdxVal == 0)
4573   // 2. Subvector should be inserted in the upper part
4574   //    (IdxVal + SubVecNumElems == NumElems)
4575   // 3. Subvector should be inserted in the middle (for example v2i1
4576   //    to v16i1, index 2)
4577
4578   // extend to natively supported kshift
4579   MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4580   MVT WideOpVT = OpVT;
4581   if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
4582     WideOpVT = MinVT;
4583
4584   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4585   SDValue Undef = DAG.getUNDEF(WideOpVT);
4586   SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4587                                    Undef, SubVec, ZeroIdx);
4588
4589   // Extract sub-vector if require.
4590   auto ExtractSubVec = [&](SDValue V) {
4591     return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
4592                                                 OpVT, V, ZeroIdx);
4593   };
4594
4595   if (Vec.isUndef()) {
4596     if (IdxVal != 0) {
4597       SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
4598       WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
4599     }
4600     return ExtractSubVec(WideSubVec);
4601   }
4602
4603   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4604     NumElems = WideOpVT.getVectorNumElements();
4605     unsigned ShiftLeft = NumElems - SubVecNumElems;
4606     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4607     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4608                              DAG.getConstant(ShiftLeft, dl, MVT::i8));
4609     Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
4610       DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
4611     return ExtractSubVec(Vec);
4612   }
4613
4614   if (IdxVal == 0) {
4615     // Zero lower bits of the Vec
4616     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4617     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4618     Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4619     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4620     // Merge them together, SubVec should be zero extended.
4621     WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4622                              getZeroVector(WideOpVT, Subtarget, DAG, dl),
4623                              SubVec, ZeroIdx);
4624     Vec =  DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4625     return ExtractSubVec(Vec);
4626   }
4627
4628   // Simple case when we put subvector in the upper part
4629   if (IdxVal + SubVecNumElems == NumElems) {
4630     // Zero upper bits of the Vec
4631     WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4632                              DAG.getConstant(IdxVal, dl, MVT::i8));
4633     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4634     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4635     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4636     Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4637     Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4638     return ExtractSubVec(Vec);
4639   }
4640   // Subvector should be inserted in the middle - use shuffle
4641   WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
4642                            SubVec, ZeroIdx);
4643   SmallVector<int, 64> Mask;
4644   for (unsigned i = 0; i < NumElems; ++i)
4645     Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
4646                     i : i + NumElems);
4647   return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
4648 }
4649
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
/// V1 is inserted at element 0 of an undef vector of type VT and V2 at
/// element NumElems / 2.
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}
4660
/// Concat two 256-bit vectors into a vector of type VT: V1 is inserted at
/// element 0 of an undef vector of type VT and V2 at element NumElems / 2.
static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}
4667
4668 /// Returns a vector of specified type with all bits set.
4669 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4670 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
4671 /// Then bitcast to their original type, ensuring they get CSE'd.
4672 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
4673                              SelectionDAG &DAG, const SDLoc &dl) {
4674   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4675          "Expected a 128/256/512-bit vector type");
4676
4677   APInt Ones = APInt::getAllOnesValue(32);
4678   unsigned NumElts = VT.getSizeInBits() / 32;
4679   SDValue Vec;
4680   if (!Subtarget.hasInt256() && NumElts == 8) {
4681     Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
4682     Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4683   } else {
4684     Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
4685   }
4686   return DAG.getBitcast(VT, Vec);
4687 }
4688
4689 /// Returns a vector_shuffle node for an unpackl operation.
4690 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4691                           SDValue V1, SDValue V2) {
4692   assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4693   unsigned NumElems = VT.getVectorNumElements();
4694   SmallVector<int, 8> Mask(NumElems);
4695   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4696     Mask[i * 2]     = i;
4697     Mask[i * 2 + 1] = i + NumElems;
4698   }
4699   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4700 }
4701
4702 /// Returns a vector_shuffle node for an unpackh operation.
4703 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4704                           SDValue V1, SDValue V2) {
4705   assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4706   unsigned NumElems = VT.getVectorNumElements();
4707   SmallVector<int, 8> Mask(NumElems);
4708   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4709     Mask[i * 2]     = i + Half;
4710     Mask[i * 2 + 1] = i + NumElems + Half;
4711   }
4712   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4713 }
4714
4715 /// Return a vector_shuffle of the specified vector of zero or undef vector.
4716 /// This produces a shuffle where the low element of V2 is swizzled into the
4717 /// zero/undef vector, landing at element Idx.
4718 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4719 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4720                                            bool IsZero,
4721                                            const X86Subtarget &Subtarget,
4722                                            SelectionDAG &DAG) {
4723   MVT VT = V2.getSimpleValueType();
4724   SDValue V1 = IsZero
4725     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4726   int NumElems = VT.getVectorNumElements();
4727   SmallVector<int, 16> MaskVec(NumElems);
4728   for (int i = 0; i != NumElems; ++i)
4729     // If this is the insertion idx, put the low elt of V2 here.
4730     MaskVec[i] = (i == Idx) ? NumElems : i;
4731   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4732 }
4733
4734 static SDValue peekThroughBitcasts(SDValue V) {
4735   while (V.getNode() && V.getOpcode() == ISD::BITCAST)
4736     V = V.getOperand(0);
4737   return V;
4738 }
4739
4740 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
4741                                         unsigned MaskEltSizeInBits,
4742                                         SmallVectorImpl<uint64_t> &RawMask) {
4743   MaskNode = peekThroughBitcasts(MaskNode);
4744
4745   MVT VT = MaskNode.getSimpleValueType();
4746   assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
4747
4748   // Split an APInt element into MaskEltSizeInBits sized pieces and
4749   // insert into the shuffle mask.
4750   auto SplitElementToMask = [&](APInt Element) {
4751     // Note that this is x86 and so always little endian: the low byte is
4752     // the first byte of the mask.
4753     int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4754     for (int i = 0; i < Split; ++i) {
4755       APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
4756       Element = Element.lshr(MaskEltSizeInBits);
4757       RawMask.push_back(RawElt.getZExtValue());
4758     }
4759   };
4760
4761   if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
4762     // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4763     // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
4764     if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
4765       return false;
4766     if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
4767       const APInt &MaskElement = CN->getAPIntValue();
4768       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
4769         APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
4770         RawMask.push_back(RawElt.getZExtValue());
4771       }
4772     }
4773     return false;
4774   }
4775
4776   if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
4777       MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
4778
4779     // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4780     if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4781       return false;
4782     unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4783
4784     SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
4785     if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
4786       SplitElementToMask(CN->getAPIntValue());
4787       RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
4788       return true;
4789     }
4790     return false;
4791   }
4792
4793   if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
4794     return false;
4795
4796   // We can always decode if the buildvector is all zero constants,
4797   // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
4798   if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
4799     RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
4800     return true;
4801   }
4802
4803   // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4804   if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4805     return false;
4806
4807   for (SDValue Op : MaskNode->ops()) {
4808     if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
4809       SplitElementToMask(CN->getAPIntValue());
4810     else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
4811       SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
4812     else
4813       return false;
4814   }
4815
4816   return true;
4817 }
4818
4819 static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
4820   MaskNode = peekThroughBitcasts(MaskNode);
4821
4822   auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
4823   if (!MaskLoad)
4824     return nullptr;
4825
4826   SDValue Ptr = MaskLoad->getBasePtr();
4827   if (Ptr->getOpcode() == X86ISD::Wrapper ||
4828       Ptr->getOpcode() == X86ISD::WrapperRIP)
4829     Ptr = Ptr->getOperand(0);
4830
4831   auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
4832   if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
4833     return nullptr;
4834
4835   return dyn_cast<Constant>(MaskCP->getConstVal());
4836 }
4837
4838 /// Calculates the shuffle mask corresponding to the target-specific opcode.
4839 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
4840 /// operands in \p Ops, and returns true.
4841 /// Sets \p IsUnary to true if only one source is used. Note that this will set
4842 /// IsUnary for shuffles which use a single input multiple times, and in those
4843 /// cases it will adjust the mask to only have indices within that single input.
4844 /// It is an error to call this with non-empty Mask/Ops vectors.
4845 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
4846                                  SmallVectorImpl<SDValue> &Ops,
4847                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
4848   unsigned NumElems = VT.getVectorNumElements();
4849   SDValue ImmN;
4850
4851   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
4852   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
4853
4854   IsUnary = false;
4855   bool IsFakeUnary = false;
4856   switch(N->getOpcode()) {
4857   case X86ISD::BLENDI:
4858     ImmN = N->getOperand(N->getNumOperands()-1);
4859     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4860     break;
4861   case X86ISD::SHUFP:
4862     ImmN = N->getOperand(N->getNumOperands()-1);
4863     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4864     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4865     break;
4866   case X86ISD::INSERTPS:
4867     ImmN = N->getOperand(N->getNumOperands()-1);
4868     DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4869     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4870     break;
4871   case X86ISD::UNPCKH:
4872     DecodeUNPCKHMask(VT, Mask);
4873     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4874     break;
4875   case X86ISD::UNPCKL:
4876     DecodeUNPCKLMask(VT, Mask);
4877     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4878     break;
4879   case X86ISD::MOVHLPS:
4880     DecodeMOVHLPSMask(NumElems, Mask);
4881     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4882     break;
4883   case X86ISD::MOVLHPS:
4884     DecodeMOVLHPSMask(NumElems, Mask);
4885     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4886     break;
4887   case X86ISD::PALIGNR:
4888     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4889     ImmN = N->getOperand(N->getNumOperands()-1);
4890     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4891     break;
4892   case X86ISD::VSHLDQ:
4893     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4894     ImmN = N->getOperand(N->getNumOperands() - 1);
4895     DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4896     IsUnary = true;
4897     break;
4898   case X86ISD::VSRLDQ:
4899     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4900     ImmN = N->getOperand(N->getNumOperands() - 1);
4901     DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4902     IsUnary = true;
4903     break;
4904   case X86ISD::PSHUFD:
4905   case X86ISD::VPERMILPI:
4906     ImmN = N->getOperand(N->getNumOperands()-1);
4907     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4908     IsUnary = true;
4909     break;
4910   case X86ISD::PSHUFHW:
4911     ImmN = N->getOperand(N->getNumOperands()-1);
4912     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4913     IsUnary = true;
4914     break;
4915   case X86ISD::PSHUFLW:
4916     ImmN = N->getOperand(N->getNumOperands()-1);
4917     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4918     IsUnary = true;
4919     break;
4920   case X86ISD::VZEXT_MOVL:
4921     DecodeZeroMoveLowMask(VT, Mask);
4922     IsUnary = true;
4923     break;
4924   case X86ISD::VBROADCAST: {
4925     // We only decode broadcasts of same-sized vectors at the moment.
4926     if (N->getOperand(0).getValueType() == VT) {
4927       DecodeVectorBroadcast(VT, Mask);
4928       IsUnary = true;
4929       break;
4930     }
4931     return false;
4932   }
4933   case X86ISD::VPERMILPV: {
4934     IsUnary = true;
4935     SDValue MaskNode = N->getOperand(1);
4936     unsigned MaskEltSize = VT.getScalarSizeInBits();
4937     SmallVector<uint64_t, 32> RawMask;
4938     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
4939       DecodeVPERMILPMask(VT, RawMask, Mask);
4940       break;
4941     }
4942     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4943       DecodeVPERMILPMask(C, MaskEltSize, Mask);
4944       break;
4945     }
4946     return false;
4947   }
4948   case X86ISD::PSHUFB: {
4949     IsUnary = true;
4950     SDValue MaskNode = N->getOperand(1);
4951     SmallVector<uint64_t, 32> RawMask;
4952     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
4953       DecodePSHUFBMask(RawMask, Mask);
4954       break;
4955     }
4956     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4957       DecodePSHUFBMask(C, Mask);
4958       break;
4959     }
4960     return false;
4961   }
4962   case X86ISD::VPERMI:
4963     ImmN = N->getOperand(N->getNumOperands()-1);
4964     DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4965     IsUnary = true;
4966     break;
4967   case X86ISD::MOVSS:
4968   case X86ISD::MOVSD:
4969     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
4970     break;
4971   case X86ISD::VPERM2X128:
4972     ImmN = N->getOperand(N->getNumOperands()-1);
4973     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4974     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4975     break;
4976   case X86ISD::MOVSLDUP:
4977     DecodeMOVSLDUPMask(VT, Mask);
4978     IsUnary = true;
4979     break;
4980   case X86ISD::MOVSHDUP:
4981     DecodeMOVSHDUPMask(VT, Mask);
4982     IsUnary = true;
4983     break;
4984   case X86ISD::MOVDDUP:
4985     DecodeMOVDDUPMask(VT, Mask);
4986     IsUnary = true;
4987     break;
4988   case X86ISD::MOVLHPD:
4989   case X86ISD::MOVLPD:
4990   case X86ISD::MOVLPS:
4991     // Not yet implemented
4992     return false;
4993   case X86ISD::VPERMIL2: {
4994     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4995     unsigned MaskEltSize = VT.getScalarSizeInBits();
4996     SDValue MaskNode = N->getOperand(2);
4997     SDValue CtrlNode = N->getOperand(3);
4998     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
4999       unsigned CtrlImm = CtrlOp->getZExtValue();
5000       SmallVector<uint64_t, 32> RawMask;
5001       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5002         DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5003         break;
5004       }
5005       if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5006         DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5007         break;
5008       }
5009     }
5010     return false;
5011   }
5012   case X86ISD::VPPERM: {
5013     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5014     SDValue MaskNode = N->getOperand(2);
5015     SmallVector<uint64_t, 32> RawMask;
5016     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5017       DecodeVPPERMMask(RawMask, Mask);
5018       break;
5019     }
5020     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5021       DecodeVPPERMMask(C, Mask);
5022       break;
5023     }
5024     return false;
5025   }
5026   case X86ISD::VPERMV: {
5027     IsUnary = true;
5028     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5029     Ops.push_back(N->getOperand(1));
5030     SDValue MaskNode = N->getOperand(0);
5031     SmallVector<uint64_t, 32> RawMask;
5032     unsigned MaskEltSize = VT.getScalarSizeInBits();
5033     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5034       DecodeVPERMVMask(RawMask, Mask);
5035       break;
5036     }
5037     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5038       DecodeVPERMVMask(C, VT, Mask);
5039       break;
5040     }
5041     return false;
5042   }
5043   case X86ISD::VPERMV3: {
5044     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5045     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5046     Ops.push_back(N->getOperand(0));
5047     Ops.push_back(N->getOperand(2));
5048     SDValue MaskNode = N->getOperand(1);
5049     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5050       DecodeVPERMV3Mask(C, VT, Mask);
5051       break;
5052     }
5053     return false;
5054   }
5055   default: llvm_unreachable("unknown target shuffle node");
5056   }
5057
5058   // Empty mask indicates the decode failed.
5059   if (Mask.empty())
5060     return false;
5061
5062   // Check if we're getting a shuffle mask with zero'd elements.
5063   if (!AllowSentinelZero)
5064     if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5065       return false;
5066
5067   // If we have a fake unary shuffle, the shuffle mask is spread across two
5068   // inputs that are actually the same node. Re-map the mask to always point
5069   // into the first input.
5070   if (IsFakeUnary)
5071     for (int &M : Mask)
5072       if (M >= (int)Mask.size())
5073         M -= Mask.size();
5074
5075   // If we didn't already add operands in the opcode-specific code, default to
5076   // adding 1 or 2 operands starting at 0.
5077   if (Ops.empty()) {
5078     Ops.push_back(N->getOperand(0));
5079     if (!IsUnary || IsFakeUnary)
5080       Ops.push_back(N->getOperand(1));
5081   }
5082
5083   return true;
5084 }
5085
5086 /// Check a target shuffle mask's inputs to see if we can set any values to
5087 /// SM_SentinelZero - this is for elements that are known to be zero
5088 /// (not just zeroable) from their inputs.
5089 /// Returns true if the target shuffle mask was decoded.
5090 static bool setTargetShuffleZeroElements(SDValue N,
5091                                          SmallVectorImpl<int> &Mask,
5092                                          SmallVectorImpl<SDValue> &Ops) {
5093   bool IsUnary;
5094   if (!isTargetShuffle(N.getOpcode()))
5095     return false;
5096   if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
5097                             Mask, IsUnary))
5098     return false;
5099
5100   SDValue V1 = Ops[0];
5101   SDValue V2 = IsUnary ? V1 : Ops[1];
5102
5103   V1 = peekThroughBitcasts(V1);
5104   V2 = peekThroughBitcasts(V2);
5105
5106   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5107     int M = Mask[i];
5108
5109     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5110     if (M < 0)
5111       continue;
5112
5113     // Determine shuffle input and normalize the mask.
5114     SDValue V = M < Size ? V1 : V2;
5115     M %= Size;
5116
5117     // We are referencing an UNDEF input.
5118     if (V.isUndef()) {
5119       Mask[i] = SM_SentinelUndef;
5120       continue;
5121     }
5122
5123     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5124     if (V.getOpcode() != ISD::BUILD_VECTOR)
5125       continue;
5126
5127     // If the BUILD_VECTOR has fewer elements then the (larger) source
5128     // element must be UNDEF/ZERO.
5129     // TODO: Is it worth testing the individual bits of a constant?
5130     if ((Size % V.getNumOperands()) == 0) {
5131       int Scale = Size / V->getNumOperands();
5132       SDValue Op = V.getOperand(M / Scale);
5133       if (Op.isUndef())
5134         Mask[i] = SM_SentinelUndef;
5135       else if (X86::isZeroNode(Op))
5136         Mask[i] = SM_SentinelZero;
5137       continue;
5138     }
5139
5140     // If the BUILD_VECTOR has more elements then all the (smaller) source
5141     // elements must be all UNDEF or all ZERO.
5142     if ((V.getNumOperands() % Size) == 0) {
5143       int Scale = V->getNumOperands() / Size;
5144       bool AllUndef = true;
5145       bool AllZero = true;
5146       for (int j = 0; j < Scale; ++j) {
5147         SDValue Op = V.getOperand((M * Scale) + j);
5148         AllUndef &= Op.isUndef();
5149         AllZero &= X86::isZeroNode(Op);
5150       }
5151       if (AllUndef)
5152         Mask[i] = SM_SentinelUndef;
5153       else if (AllZero)
5154         Mask[i] = SM_SentinelZero;
5155       continue;
5156     }
5157   }
5158
5159   return true;
5160 }
5161
5162 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5163 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5164 /// remaining input indices in case we now have a unary shuffle and adjust the
5165 /// Op0/Op1 inputs accordingly.
5166 /// Returns true if the target shuffle mask was decoded.
5167 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5168                                        SmallVectorImpl<int> &Mask) {
5169   SmallVector<SDValue, 2> Ops;
5170   if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5171     return false;
5172
5173   int NumElts = Mask.size();
5174   bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
5175     return 0 <= Idx && Idx < NumElts;
5176   });
5177   bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
5178                               [NumElts](int Idx) { return NumElts <= Idx; });
5179
5180   Op0 = Op0InUse ? Ops[0] : SDValue();
5181   Op1 = Op1InUse ? Ops[1] : SDValue();
5182
5183   // We're only using Op1 - commute the mask and inputs.
5184   if (!Op0InUse && Op1InUse) {
5185     for (int &M : Mask)
5186       if (NumElts <= M)
5187         M -= NumElts;
5188     Op0 = Op1;
5189     Op1 = SDValue();
5190   }
5191
5192   return true;
5193 }
5194
5195 /// Returns the scalar element that will make up the ith
5196 /// element of the result of the vector shuffle.
5197 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5198                                    unsigned Depth) {
5199   if (Depth == 6)
5200     return SDValue();  // Limit search depth.
5201
5202   SDValue V = SDValue(N, 0);
5203   EVT VT = V.getValueType();
5204   unsigned Opcode = V.getOpcode();
5205
5206   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5207   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5208     int Elt = SV->getMaskElt(Index);
5209
5210     if (Elt < 0)
5211       return DAG.getUNDEF(VT.getVectorElementType());
5212
5213     unsigned NumElems = VT.getVectorNumElements();
5214     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5215                                          : SV->getOperand(1);
5216     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5217   }
5218
5219   // Recurse into target specific vector shuffles to find scalars.
5220   if (isTargetShuffle(Opcode)) {
5221     MVT ShufVT = V.getSimpleValueType();
5222     MVT ShufSVT = ShufVT.getVectorElementType();
5223     int NumElems = (int)ShufVT.getVectorNumElements();
5224     SmallVector<int, 16> ShuffleMask;
5225     SmallVector<SDValue, 16> ShuffleOps;
5226     bool IsUnary;
5227
5228     if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
5229       return SDValue();
5230
5231     int Elt = ShuffleMask[Index];
5232     if (Elt == SM_SentinelZero)
5233       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5234                                  : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5235     if (Elt == SM_SentinelUndef)
5236       return DAG.getUNDEF(ShufSVT);
5237
5238     assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5239     SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5240     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5241                                Depth+1);
5242   }
5243
5244   // Actual nodes that may contain scalar elements
5245   if (Opcode == ISD::BITCAST) {
5246     V = V.getOperand(0);
5247     EVT SrcVT = V.getValueType();
5248     unsigned NumElems = VT.getVectorNumElements();
5249
5250     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5251       return SDValue();
5252   }
5253
5254   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5255     return (Index == 0) ? V.getOperand(0)
5256                         : DAG.getUNDEF(VT.getVectorElementType());
5257
5258   if (V.getOpcode() == ISD::BUILD_VECTOR)
5259     return V.getOperand(Index);
5260
5261   return SDValue();
5262 }
5263
5264 /// Custom lower build_vector of v16i8.
5265 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5266                                        unsigned NumNonZero, unsigned NumZero,
5267                                        SelectionDAG &DAG,
5268                                        const X86Subtarget &Subtarget,
5269                                        const TargetLowering &TLI) {
5270   if (NumNonZero > 8)
5271     return SDValue();
5272
5273   SDLoc dl(Op);
5274   SDValue V;
5275   bool First = true;
5276
5277   // SSE4.1 - use PINSRB to insert each byte directly.
5278   if (Subtarget.hasSSE41()) {
5279     for (unsigned i = 0; i < 16; ++i) {
5280       bool isNonZero = (NonZeros & (1 << i)) != 0;
5281       if (isNonZero) {
5282         if (First) {
5283           if (NumZero)
5284             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5285           else
5286             V = DAG.getUNDEF(MVT::v16i8);
5287           First = false;
5288         }
5289         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5290                         MVT::v16i8, V, Op.getOperand(i),
5291                         DAG.getIntPtrConstant(i, dl));
5292       }
5293     }
5294
5295     return V;
5296   }
5297
5298   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5299   for (unsigned i = 0; i < 16; ++i) {
5300     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5301     if (ThisIsNonZero && First) {
5302       if (NumZero)
5303         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5304       else
5305         V = DAG.getUNDEF(MVT::v8i16);
5306       First = false;
5307     }
5308
5309     if ((i & 1) != 0) {
5310       SDValue ThisElt, LastElt;
5311       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5312       if (LastIsNonZero) {
5313         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5314                               MVT::i16, Op.getOperand(i-1));
5315       }
5316       if (ThisIsNonZero) {
5317         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5318         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5319                               ThisElt, DAG.getConstant(8, dl, MVT::i8));
5320         if (LastIsNonZero)
5321           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5322       } else
5323         ThisElt = LastElt;
5324
5325       if (ThisElt.getNode())
5326         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5327                         DAG.getIntPtrConstant(i/2, dl));
5328     }
5329   }
5330
5331   return DAG.getBitcast(MVT::v16i8, V);
5332 }
5333
5334 /// Custom lower build_vector of v8i16.
5335 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5336                                      unsigned NumNonZero, unsigned NumZero,
5337                                      SelectionDAG &DAG,
5338                                      const X86Subtarget &Subtarget,
5339                                      const TargetLowering &TLI) {
5340   if (NumNonZero > 4)
5341     return SDValue();
5342
5343   SDLoc dl(Op);
5344   SDValue V;
5345   bool First = true;
5346   for (unsigned i = 0; i < 8; ++i) {
5347     bool isNonZero = (NonZeros & (1 << i)) != 0;
5348     if (isNonZero) {
5349       if (First) {
5350         if (NumZero)
5351           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5352         else
5353           V = DAG.getUNDEF(MVT::v8i16);
5354         First = false;
5355       }
5356       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5357                       MVT::v8i16, V, Op.getOperand(i),
5358                       DAG.getIntPtrConstant(i, dl));
5359     }
5360   }
5361
5362   return V;
5363 }
5364
5365 /// Custom lower build_vector of v4i32 or v4f32.
5366 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5367                                      const X86Subtarget &Subtarget,
5368                                      const TargetLowering &TLI) {
5369   // Find all zeroable elements.
5370   std::bitset<4> Zeroable;
5371   for (int i=0; i < 4; ++i) {
5372     SDValue Elt = Op->getOperand(i);
5373     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
5374   }
5375   assert(Zeroable.size() - Zeroable.count() > 1 &&
5376          "We expect at least two non-zero elements!");
5377
5378   // We only know how to deal with build_vector nodes where elements are either
5379   // zeroable or extract_vector_elt with constant index.
5380   SDValue FirstNonZero;
5381   unsigned FirstNonZeroIdx;
5382   for (unsigned i=0; i < 4; ++i) {
5383     if (Zeroable[i])
5384       continue;
5385     SDValue Elt = Op->getOperand(i);
5386     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5387         !isa<ConstantSDNode>(Elt.getOperand(1)))
5388       return SDValue();
5389     // Make sure that this node is extracting from a 128-bit vector.
5390     MVT VT = Elt.getOperand(0).getSimpleValueType();
5391     if (!VT.is128BitVector())
5392       return SDValue();
5393     if (!FirstNonZero.getNode()) {
5394       FirstNonZero = Elt;
5395       FirstNonZeroIdx = i;
5396     }
5397   }
5398
5399   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5400   SDValue V1 = FirstNonZero.getOperand(0);
5401   MVT VT = V1.getSimpleValueType();
5402
5403   // See if this build_vector can be lowered as a blend with zero.
5404   SDValue Elt;
5405   unsigned EltMaskIdx, EltIdx;
5406   int Mask[4];
5407   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5408     if (Zeroable[EltIdx]) {
5409       // The zero vector will be on the right hand side.
5410       Mask[EltIdx] = EltIdx+4;
5411       continue;
5412     }
5413
5414     Elt = Op->getOperand(EltIdx);
5415     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5416     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5417     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5418       break;
5419     Mask[EltIdx] = EltIdx;
5420   }
5421
5422   if (EltIdx == 4) {
5423     // Let the shuffle legalizer deal with blend operations.
5424     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5425     if (V1.getSimpleValueType() != VT)
5426       V1 = DAG.getBitcast(VT, V1);
5427     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
5428   }
5429
5430   // See if we can lower this build_vector to a INSERTPS.
5431   if (!Subtarget.hasSSE41())
5432     return SDValue();
5433
5434   SDValue V2 = Elt.getOperand(0);
5435   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5436     V1 = SDValue();
5437
5438   bool CanFold = true;
5439   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5440     if (Zeroable[i])
5441       continue;
5442
5443     SDValue Current = Op->getOperand(i);
5444     SDValue SrcVector = Current->getOperand(0);
5445     if (!V1.getNode())
5446       V1 = SrcVector;
5447     CanFold = SrcVector == V1 &&
5448       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5449   }
5450
5451   if (!CanFold)
5452     return SDValue();
5453
5454   assert(V1.getNode() && "Expected at least two non-zero elements!");
5455   if (V1.getSimpleValueType() != MVT::v4f32)
5456     V1 = DAG.getBitcast(MVT::v4f32, V1);
5457   if (V2.getSimpleValueType() != MVT::v4f32)
5458     V2 = DAG.getBitcast(MVT::v4f32, V2);
5459
5460   // Ok, we can emit an INSERTPS instruction.
5461   unsigned ZMask = Zeroable.to_ulong();
5462
5463   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5464   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5465   SDLoc DL(Op);
5466   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
5467                                DAG.getIntPtrConstant(InsertPSMask, DL));
5468   return DAG.getBitcast(VT, Result);
5469 }
5470
5471 /// Return a vector logical shift node.
5472 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
5473                          SelectionDAG &DAG, const TargetLowering &TLI,
5474                          const SDLoc &dl) {
5475   assert(VT.is128BitVector() && "Unknown type for VShift");
5476   MVT ShVT = MVT::v16i8;
5477   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5478   SrcOp = DAG.getBitcast(ShVT, SrcOp);
5479   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
5480   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
5481   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
5482   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5483 }
5484
5485 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
5486                                       SelectionDAG &DAG) {
5487
5488   // Check if the scalar load can be widened into a vector load. And if
5489   // the address is "base + cst" see if the cst can be "absorbed" into
5490   // the shuffle mask.
5491   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5492     SDValue Ptr = LD->getBasePtr();
5493     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5494       return SDValue();
5495     EVT PVT = LD->getValueType(0);
5496     if (PVT != MVT::i32 && PVT != MVT::f32)
5497       return SDValue();
5498
5499     int FI = -1;
5500     int64_t Offset = 0;
5501     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5502       FI = FINode->getIndex();
5503       Offset = 0;
5504     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5505                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5506       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5507       Offset = Ptr.getConstantOperandVal(1);
5508       Ptr = Ptr.getOperand(0);
5509     } else {
5510       return SDValue();
5511     }
5512
5513     // FIXME: 256-bit vector instructions don't require a strict alignment,
5514     // improve this code to support it better.
5515     unsigned RequiredAlign = VT.getSizeInBits()/8;
5516     SDValue Chain = LD->getChain();
5517     // Make sure the stack object alignment is at least 16 or 32.
5518     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5519     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5520       if (MFI->isFixedObjectIndex(FI)) {
5521         // Can't change the alignment. FIXME: It's possible to compute
5522         // the exact stack offset and reference FI + adjust offset instead.
5523         // If someone *really* cares about this. That's the way to implement it.
5524         return SDValue();
5525       } else {
5526         MFI->setObjectAlignment(FI, RequiredAlign);
5527       }
5528     }
5529
5530     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5531     // Ptr + (Offset & ~15).
5532     if (Offset < 0)
5533       return SDValue();
5534     if ((Offset % RequiredAlign) & 3)
5535       return SDValue();
5536     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
5537     if (StartOffset) {
5538       SDLoc DL(Ptr);
5539       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5540                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
5541     }
5542
5543     int EltNo = (Offset - StartOffset) >> 2;
5544     unsigned NumElems = VT.getVectorNumElements();
5545
5546     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5547     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5548                              LD->getPointerInfo().getWithOffset(StartOffset));
5549
5550     SmallVector<int, 8> Mask(NumElems, EltNo);
5551
5552     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
5553   }
5554
5555   return SDValue();
5556 }
5557
5558 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
5559 /// elements can be replaced by a single large load which has the same value as
5560 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
5561 ///
5562 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
5563 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
5564                                         SDLoc &DL, SelectionDAG &DAG,
5565                                         bool isAfterLegalize) {
5566   unsigned NumElems = Elts.size();
5567
5568   int LastLoadedElt = -1;
5569   SmallBitVector LoadMask(NumElems, false);
5570   SmallBitVector ZeroMask(NumElems, false);
5571   SmallBitVector UndefMask(NumElems, false);
5572
5573   // For each element in the initializer, see if we've found a load, zero or an
5574   // undef.
5575   for (unsigned i = 0; i < NumElems; ++i) {
5576     SDValue Elt = peekThroughBitcasts(Elts[i]);
5577     if (!Elt.getNode())
5578       return SDValue();
5579
5580     if (Elt.isUndef())
5581       UndefMask[i] = true;
5582     else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
5583       ZeroMask[i] = true;
5584     else if (ISD::isNON_EXTLoad(Elt.getNode())) {
5585       LoadMask[i] = true;
5586       LastLoadedElt = i;
5587       // Each loaded element must be the correct fractional portion of the
5588       // requested vector load.
5589       if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
5590         return SDValue();
5591     } else
5592       return SDValue();
5593   }
5594   assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
5595          "Incomplete element masks");
5596
5597   // Handle Special Cases - all undef or undef/zero.
5598   if (UndefMask.count() == NumElems)
5599     return DAG.getUNDEF(VT);
5600
5601   // FIXME: Should we return this as a BUILD_VECTOR instead?
5602   if ((ZeroMask | UndefMask).count() == NumElems)
5603     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
5604                           : DAG.getConstantFP(0.0, DL, VT);
5605
5606   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5607   int FirstLoadedElt = LoadMask.find_first();
5608   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
5609   LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
5610   EVT LDBaseVT = EltBase.getValueType();
5611
5612   // Consecutive loads can contain UNDEFS but not ZERO elements.
5613   // Consecutive loads with UNDEFs and ZEROs elements require a
5614   // an additional shuffle stage to clear the ZERO elements.
5615   bool IsConsecutiveLoad = true;
5616   bool IsConsecutiveLoadWithZeros = true;
5617   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
5618     if (LoadMask[i]) {
5619       SDValue Elt = peekThroughBitcasts(Elts[i]);
5620       LoadSDNode *LD = cast<LoadSDNode>(Elt);
5621       if (!DAG.areNonVolatileConsecutiveLoads(
5622               LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
5623               i - FirstLoadedElt)) {
5624         IsConsecutiveLoad = false;
5625         IsConsecutiveLoadWithZeros = false;
5626         break;
5627       }
5628     } else if (ZeroMask[i]) {
5629       IsConsecutiveLoad = false;
5630     }
5631   }
5632
5633   auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
5634     auto MMOFlags = LDBase->getMemOperand()->getFlags();
5635     assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
5636            "Cannot merge volatile loads.");
5637     SDValue NewLd =
5638         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5639                     LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
5640
5641     if (LDBase->hasAnyUseOfValue(1)) {
5642       SDValue NewChain =
5643           DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5644                       SDValue(NewLd.getNode(), 1));
5645       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5646       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5647                              SDValue(NewLd.getNode(), 1));
5648     }
5649
5650     return NewLd;
5651   };
5652
5653   // LOAD - all consecutive load/undefs (must start/end with a load).
5654   // If we have found an entire vector of loads and undefs, then return a large
5655   // load of the entire vector width starting at the base pointer.
5656   // If the vector contains zeros, then attempt to shuffle those elements.
5657   if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
5658       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
5659     assert(LDBase && "Did not find base load for merging consecutive loads");
5660     EVT EltVT = LDBase->getValueType(0);
5661     // Ensure that the input vector size for the merged loads matches the
5662     // cumulative size of the input elements.
5663     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
5664       return SDValue();
5665
5666     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
5667       return SDValue();
5668
5669     if (IsConsecutiveLoad)
5670       return CreateLoad(VT, LDBase);
5671
5672     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
5673     // vector and a zero vector to clear out the zero elements.
5674     if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
5675       SmallVector<int, 4> ClearMask(NumElems, -1);
5676       for (unsigned i = 0; i < NumElems; ++i) {
5677         if (ZeroMask[i])
5678           ClearMask[i] = i + NumElems;
5679         else if (LoadMask[i])
5680           ClearMask[i] = i;
5681       }
5682       SDValue V = CreateLoad(VT, LDBase);
5683       SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
5684                                  : DAG.getConstantFP(0.0, DL, VT);
5685       return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
5686     }
5687   }
5688
5689   int LoadSize =
5690       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
5691
5692   // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
5693   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
5694       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5695     MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
5696     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
5697     if (TLI.isTypeLegal(VecVT)) {
5698       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
5699       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5700       SDValue ResNode =
5701           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
5702                                   LDBase->getPointerInfo(),
5703                                   LDBase->getAlignment(),
5704                                   false/*isVolatile*/, true/*ReadMem*/,
5705                                   false/*WriteMem*/);
5706
5707       // Make sure the newly-created LOAD is in the same position as LDBase in
5708       // terms of dependency. We create a TokenFactor for LDBase and ResNode,
5709       // and update uses of LDBase's output chain to use the TokenFactor.
5710       if (LDBase->hasAnyUseOfValue(1)) {
5711         SDValue NewChain =
5712             DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5713                         SDValue(ResNode.getNode(), 1));
5714         DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5715         DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5716                                SDValue(ResNode.getNode(), 1));
5717       }
5718
5719       return DAG.getBitcast(VT, ResNode);
5720     }
5721   }
5722
5723   // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
5724   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
5725       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5726     MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
5727     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
5728     if (TLI.isTypeLegal(VecVT)) {
5729       SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
5730                                      : DAG.getBitcast(VecSVT, EltBase);
5731       V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
5732       V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
5733       return DAG.getBitcast(VT, V);
5734     }
5735   }
5736
5737   return SDValue();
5738 }
5739
5740 /// Attempt to use the vbroadcast instruction to generate a splat value for the
5741 /// following cases:
5742 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5743 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5744 /// a scalar load, or a constant.
5745 /// The VBROADCAST node is returned when a pattern is found,
5746 /// or SDValue() otherwise.
5747 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
5748                                     SelectionDAG &DAG) {
5749   // VBROADCAST requires AVX.
5750   // TODO: Splats could be generated for non-AVX CPUs using SSE
5751   // instructions, but there's less potential gain for only 128-bit vectors.
5752   if (!Subtarget.hasAVX())
5753     return SDValue();
5754
5755   MVT VT = Op.getSimpleValueType();
5756   SDLoc dl(Op);
5757
5758   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5759          "Unsupported vector type for broadcast.");
5760
5761   SDValue Ld;
5762   bool ConstSplatVal;
5763
5764   switch (Op.getOpcode()) {
5765     default:
5766       // Unknown pattern found.
5767       return SDValue();
5768
5769     case ISD::BUILD_VECTOR: {
5770       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
5771       BitVector UndefElements;
5772       SDValue Splat = BVOp->getSplatValue(&UndefElements);
5773
5774       // We need a splat of a single value to use broadcast, and it doesn't
5775       // make any sense if the value is only in one element of the vector.
5776       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
5777         return SDValue();
5778
5779       Ld = Splat;
5780       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5781                        Ld.getOpcode() == ISD::ConstantFP);
5782
5783       // Make sure that all of the users of a non-constant load are from the
5784       // BUILD_VECTOR node.
5785       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
5786         return SDValue();
5787       break;
5788     }
5789
5790     case ISD::VECTOR_SHUFFLE: {
5791       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5792
5793       // Shuffles must have a splat mask where the first element is
5794       // broadcasted.
5795       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5796         return SDValue();
5797
5798       SDValue Sc = Op.getOperand(0);
5799       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5800           Sc.getOpcode() != ISD::BUILD_VECTOR) {
5801
5802         if (!Subtarget.hasInt256())
5803           return SDValue();
5804
5805         // Use the register form of the broadcast instruction available on AVX2.
5806         if (VT.getSizeInBits() >= 256)
5807           Sc = extract128BitVector(Sc, 0, DAG, dl);
5808         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5809       }
5810
5811       Ld = Sc.getOperand(0);
5812       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5813                        Ld.getOpcode() == ISD::ConstantFP);
5814
5815       // The scalar_to_vector node and the suspected
5816       // load node must have exactly one user.
5817       // Constants may have multiple users.
5818
5819       // AVX-512 has register version of the broadcast
5820       bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
5821         Ld.getValueType().getSizeInBits() >= 32;
5822       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
5823           !hasRegVer))
5824         return SDValue();
5825       break;
5826     }
5827   }
5828
5829   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5830   bool IsGE256 = (VT.getSizeInBits() >= 256);
5831
5832   // When optimizing for size, generate up to 5 extra bytes for a broadcast
5833   // instruction to save 8 or more bytes of constant pool data.
5834   // TODO: If multiple splats are generated to load the same constant,
5835   // it may be detrimental to overall size. There needs to be a way to detect
5836   // that condition to know if this is truly a size win.
5837   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
5838
5839   // Handle broadcasting a single constant scalar from the constant pool
5840   // into a vector.
5841   // On Sandybridge (no AVX2), it is still better to load a constant vector
5842   // from the constant pool and not to broadcast it from a scalar.
5843   // But override that restriction when optimizing for size.
5844   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
5845   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
5846     EVT CVT = Ld.getValueType();
5847     assert(!CVT.isVector() && "Must not broadcast a vector type");
5848
5849     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
5850     // For size optimization, also splat v2f64 and v2i64, and for size opt
5851     // with AVX2, also splat i8 and i16.
5852     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
5853     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5854         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
5855       const Constant *C = nullptr;
5856       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5857         C = CI->getConstantIntValue();
5858       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5859         C = CF->getConstantFPValue();
5860
5861       assert(C && "Invalid constant type");
5862
5863       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5864       SDValue CP =
5865           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
5866       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5867       Ld = DAG.getLoad(
5868           CVT, dl, DAG.getEntryNode(), CP,
5869           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
5870           Alignment);
5871
5872       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5873     }
5874   }
5875
5876   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5877
5878   // Handle AVX2 in-register broadcasts.
5879   if (!IsLoad && Subtarget.hasInt256() &&
5880       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
5881     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5882
5883   // The scalar source must be a normal load.
5884   if (!IsLoad)
5885     return SDValue();
5886
5887   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5888       (Subtarget.hasVLX() && ScalarSize == 64))
5889     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5890
5891   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
5892   // double since there is no vbroadcastsd xmm
5893   if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
5894     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5895       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5896   }
5897
5898   // Unsupported broadcast.
5899   return SDValue();
5900 }
5901
5902 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
5903 /// underlying vector and index.
5904 ///
5905 /// Modifies \p ExtractedFromVec to the real vector and returns the real
5906 /// index.
5907 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
5908                                          SDValue ExtIdx) {
5909   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5910   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
5911     return Idx;
5912
5913   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
5914   // lowered this:
5915   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
5916   // to:
5917   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
5918   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
5919   //                           undef)
5920   //                       Constant<0>)
5921   // In this case the vector is the extract_subvector expression and the index
5922   // is 2, as specified by the shuffle.
5923   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
5924   SDValue ShuffleVec = SVOp->getOperand(0);
5925   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
5926   assert(ShuffleVecVT.getVectorElementType() ==
5927          ExtractedFromVec.getSimpleValueType().getVectorElementType());
5928
5929   int ShuffleIdx = SVOp->getMaskElt(Idx);
5930   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
5931     ExtractedFromVec = ShuffleVec;
5932     return ShuffleIdx;
5933   }
5934   return Idx;
5935 }
5936
5937 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
5938   MVT VT = Op.getSimpleValueType();
5939
5940   // Skip if insert_vec_elt is not supported.
5941   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5942   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5943     return SDValue();
5944
5945   SDLoc DL(Op);
5946   unsigned NumElems = Op.getNumOperands();
5947
5948   SDValue VecIn1;
5949   SDValue VecIn2;
5950   SmallVector<unsigned, 4> InsertIndices;
5951   SmallVector<int, 8> Mask(NumElems, -1);
5952
5953   for (unsigned i = 0; i != NumElems; ++i) {
5954     unsigned Opc = Op.getOperand(i).getOpcode();
5955
5956     if (Opc == ISD::UNDEF)
5957       continue;
5958
5959     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5960       // Quit if more than 1 elements need inserting.
5961       if (InsertIndices.size() > 1)
5962         return SDValue();
5963
5964       InsertIndices.push_back(i);
5965       continue;
5966     }
5967
5968     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5969     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5970     // Quit if non-constant index.
5971     if (!isa<ConstantSDNode>(ExtIdx))
5972       return SDValue();
5973     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
5974
5975     // Quit if extracted from vector of different type.
5976     if (ExtractedFromVec.getValueType() != VT)
5977       return SDValue();
5978
5979     if (!VecIn1.getNode())
5980       VecIn1 = ExtractedFromVec;
5981     else if (VecIn1 != ExtractedFromVec) {
5982       if (!VecIn2.getNode())
5983         VecIn2 = ExtractedFromVec;
5984       else if (VecIn2 != ExtractedFromVec)
5985         // Quit if more than 2 vectors to shuffle
5986         return SDValue();
5987     }
5988
5989     if (ExtractedFromVec == VecIn1)
5990       Mask[i] = Idx;
5991     else if (ExtractedFromVec == VecIn2)
5992       Mask[i] = Idx + NumElems;
5993   }
5994
5995   if (!VecIn1.getNode())
5996     return SDValue();
5997
5998   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5999   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6000   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6001     unsigned Idx = InsertIndices[i];
6002     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6003                      DAG.getIntPtrConstant(Idx, DL));
6004   }
6005
6006   return NV;
6007 }
6008
6009 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6010   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6011          Op.getScalarValueSizeInBits() == 1 &&
6012          "Can not convert non-constant vector");
6013   uint64_t Immediate = 0;
6014   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6015     SDValue In = Op.getOperand(idx);
6016     if (!In.isUndef())
6017       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6018   }
6019   SDLoc dl(Op);
6020   MVT VT =
6021    MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
6022   return DAG.getConstant(Immediate, dl, VT);
6023 }
6024 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6025 SDValue
6026 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6027
6028   MVT VT = Op.getSimpleValueType();
6029   assert((VT.getVectorElementType() == MVT::i1) &&
6030          "Unexpected type in LowerBUILD_VECTORvXi1!");
6031
6032   SDLoc dl(Op);
6033   if (ISD::isBuildVectorAllZeros(Op.getNode()))
6034     return DAG.getTargetConstant(0, dl, VT);
6035
6036   if (ISD::isBuildVectorAllOnes(Op.getNode()))
6037     return DAG.getTargetConstant(1, dl, VT);
6038
6039   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6040     SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6041     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6042       return DAG.getBitcast(VT, Imm);
6043     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6044     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6045                         DAG.getIntPtrConstant(0, dl));
6046   }
6047
6048   // Vector has one or more non-const elements
6049   uint64_t Immediate = 0;
6050   SmallVector<unsigned, 16> NonConstIdx;
6051   bool IsSplat = true;
6052   bool HasConstElts = false;
6053   int SplatIdx = -1;
6054   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6055     SDValue In = Op.getOperand(idx);
6056     if (In.isUndef())
6057       continue;
6058     if (!isa<ConstantSDNode>(In))
6059       NonConstIdx.push_back(idx);
6060     else {
6061       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6062       HasConstElts = true;
6063     }
6064     if (SplatIdx < 0)
6065       SplatIdx = idx;
6066     else if (In != Op.getOperand(SplatIdx))
6067       IsSplat = false;
6068   }
6069
6070   // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
6071   if (IsSplat)
6072     return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6073                        DAG.getConstant(1, dl, VT),
6074                        DAG.getConstant(0, dl, VT));
6075
6076   // insert elements one by one
6077   SDValue DstVec;
6078   SDValue Imm;
6079   if (Immediate) {
6080     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6081     Imm = DAG.getConstant(Immediate, dl, ImmVT);
6082   }
6083   else if (HasConstElts)
6084     Imm = DAG.getConstant(0, dl, VT);
6085   else
6086     Imm = DAG.getUNDEF(VT);
6087   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6088     DstVec = DAG.getBitcast(VT, Imm);
6089   else {
6090     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6091     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6092                          DAG.getIntPtrConstant(0, dl));
6093   }
6094
6095   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6096     unsigned InsertIdx = NonConstIdx[i];
6097     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6098                          Op.getOperand(InsertIdx),
6099                          DAG.getIntPtrConstant(InsertIdx, dl));
6100   }
6101   return DstVec;
6102 }
6103
6104 /// \brief Return true if \p N implements a horizontal binop and return the
6105 /// operands for the horizontal binop into V0 and V1.
6106 ///
6107 /// This is a helper function of LowerToHorizontalOp().
6108 /// This function checks that the build_vector \p N in input implements a
6109 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6110 /// operation to match.
6111 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6112 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6113 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6114 /// arithmetic sub.
6115 ///
6116 /// This function only analyzes elements of \p N whose indices are
6117 /// in range [BaseIdx, LastIdx).
6118 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6119                               SelectionDAG &DAG,
6120                               unsigned BaseIdx, unsigned LastIdx,
6121                               SDValue &V0, SDValue &V1) {
6122   EVT VT = N->getValueType(0);
6123
6124   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6125   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6126          "Invalid Vector in input!");
6127
6128   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6129   bool CanFold = true;
6130   unsigned ExpectedVExtractIdx = BaseIdx;
6131   unsigned NumElts = LastIdx - BaseIdx;
6132   V0 = DAG.getUNDEF(VT);
6133   V1 = DAG.getUNDEF(VT);
6134
6135   // Check if N implements a horizontal binop.
6136   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6137     SDValue Op = N->getOperand(i + BaseIdx);
6138
6139     // Skip UNDEFs.
6140     if (Op->isUndef()) {
6141       // Update the expected vector extract index.
6142       if (i * 2 == NumElts)
6143         ExpectedVExtractIdx = BaseIdx;
6144       ExpectedVExtractIdx += 2;
6145       continue;
6146     }
6147
6148     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6149
6150     if (!CanFold)
6151       break;
6152
6153     SDValue Op0 = Op.getOperand(0);
6154     SDValue Op1 = Op.getOperand(1);
6155
6156     // Try to match the following pattern:
6157     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6158     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6159         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6160         Op0.getOperand(0) == Op1.getOperand(0) &&
6161         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6162         isa<ConstantSDNode>(Op1.getOperand(1)));
6163     if (!CanFold)
6164       break;
6165
6166     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6167     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6168
6169     if (i * 2 < NumElts) {
6170       if (V0.isUndef()) {
6171         V0 = Op0.getOperand(0);
6172         if (V0.getValueType() != VT)
6173           return false;
6174       }
6175     } else {
6176       if (V1.isUndef()) {
6177         V1 = Op0.getOperand(0);
6178         if (V1.getValueType() != VT)
6179           return false;
6180       }
6181       if (i * 2 == NumElts)
6182         ExpectedVExtractIdx = BaseIdx;
6183     }
6184
6185     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6186     if (I0 == ExpectedVExtractIdx)
6187       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6188     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6189       // Try to match the following dag sequence:
6190       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6191       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6192     } else
6193       CanFold = false;
6194
6195     ExpectedVExtractIdx += 2;
6196   }
6197
6198   return CanFold;
6199 }
6200
6201 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6202 /// a concat_vector.
6203 ///
6204 /// This is a helper function of LowerToHorizontalOp().
6205 /// This function expects two 256-bit vectors called V0 and V1.
6206 /// At first, each vector is split into two separate 128-bit vectors.
6207 /// Then, the resulting 128-bit vectors are used to implement two
6208 /// horizontal binary operations.
6209 ///
6210 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6211 ///
6212 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6213 /// the two new horizontal binop.
6214 /// When Mode is set, the first horizontal binop dag node would take as input
6215 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6216 /// horizontal binop dag node would take as input the lower 128-bit of V1
6217 /// and the upper 128-bit of V1.
6218 ///   Example:
6219 ///     HADD V0_LO, V0_HI
6220 ///     HADD V1_LO, V1_HI
6221 ///
6222 /// Otherwise, the first horizontal binop dag node takes as input the lower
6223 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6224 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6225 ///   Example:
6226 ///     HADD V0_LO, V1_LO
6227 ///     HADD V0_HI, V1_HI
6228 ///
6229 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6230 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6231 /// the upper 128-bits of the result.
6232 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6233                                      const SDLoc &DL, SelectionDAG &DAG,
6234                                      unsigned X86Opcode, bool Mode,
6235                                      bool isUndefLO, bool isUndefHI) {
6236   MVT VT = V0.getSimpleValueType();
6237   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6238          "Invalid nodes in input!");
6239
6240   unsigned NumElts = VT.getVectorNumElements();
6241   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6242   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6243   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6244   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6245   MVT NewVT = V0_LO.getSimpleValueType();
6246
6247   SDValue LO = DAG.getUNDEF(NewVT);
6248   SDValue HI = DAG.getUNDEF(NewVT);
6249
6250   if (Mode) {
6251     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6252     if (!isUndefLO && !V0->isUndef())
6253       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6254     if (!isUndefHI && !V1->isUndef())
6255       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6256   } else {
6257     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6258     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
6259       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6260
6261     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
6262       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6263   }
6264
6265   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6266 }
6267
6268 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
6269 /// node.
6270 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
6271                              const X86Subtarget &Subtarget, SelectionDAG &DAG) {
6272   MVT VT = BV->getSimpleValueType(0);
6273   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6274       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
6275     return SDValue();
6276
6277   SDLoc DL(BV);
6278   unsigned NumElts = VT.getVectorNumElements();
6279   SDValue InVec0 = DAG.getUNDEF(VT);
6280   SDValue InVec1 = DAG.getUNDEF(VT);
6281
6282   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6283           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6284
6285   // Odd-numbered elements in the input build vector are obtained from
6286   // adding two integer/float elements.
6287   // Even-numbered elements in the input build vector are obtained from
6288   // subtracting two integer/float elements.
6289   unsigned ExpectedOpcode = ISD::FSUB;
6290   unsigned NextExpectedOpcode = ISD::FADD;
6291   bool AddFound = false;
6292   bool SubFound = false;
6293
6294   for (unsigned i = 0, e = NumElts; i != e; ++i) {
6295     SDValue Op = BV->getOperand(i);
6296
6297     // Skip 'undef' values.
6298     unsigned Opcode = Op.getOpcode();
6299     if (Opcode == ISD::UNDEF) {
6300       std::swap(ExpectedOpcode, NextExpectedOpcode);
6301       continue;
6302     }
6303
6304     // Early exit if we found an unexpected opcode.
6305     if (Opcode != ExpectedOpcode)
6306       return SDValue();
6307
6308     SDValue Op0 = Op.getOperand(0);
6309     SDValue Op1 = Op.getOperand(1);
6310
6311     // Try to match the following pattern:
6312     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6313     // Early exit if we cannot match that sequence.
6314     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6315         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6316         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6317         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6318         Op0.getOperand(1) != Op1.getOperand(1))
6319       return SDValue();
6320
6321     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6322     if (I0 != i)
6323       return SDValue();
6324
6325     // We found a valid add/sub node. Update the information accordingly.
6326     if (i & 1)
6327       AddFound = true;
6328     else
6329       SubFound = true;
6330
6331     // Update InVec0 and InVec1.
6332     if (InVec0.isUndef()) {
6333       InVec0 = Op0.getOperand(0);
6334       if (InVec0.getSimpleValueType() != VT)
6335         return SDValue();
6336     }
6337     if (InVec1.isUndef()) {
6338       InVec1 = Op1.getOperand(0);
6339       if (InVec1.getSimpleValueType() != VT)
6340         return SDValue();
6341     }
6342
6343     // Make sure that operands in input to each add/sub node always
6344     // come from a same pair of vectors.
6345     if (InVec0 != Op0.getOperand(0)) {
6346       if (ExpectedOpcode == ISD::FSUB)
6347         return SDValue();
6348
6349       // FADD is commutable. Try to commute the operands
6350       // and then test again.
6351       std::swap(Op0, Op1);
6352       if (InVec0 != Op0.getOperand(0))
6353         return SDValue();
6354     }
6355
6356     if (InVec1 != Op1.getOperand(0))
6357       return SDValue();
6358
6359     // Update the pair of expected opcodes.
6360     std::swap(ExpectedOpcode, NextExpectedOpcode);
6361   }
6362
6363   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6364   if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
6365     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6366
6367   return SDValue();
6368 }
6369
6370 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
6371 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
6372                                    const X86Subtarget &Subtarget,
6373                                    SelectionDAG &DAG) {
6374   MVT VT = BV->getSimpleValueType(0);
6375   unsigned NumElts = VT.getVectorNumElements();
6376   unsigned NumUndefsLO = 0;
6377   unsigned NumUndefsHI = 0;
6378   unsigned Half = NumElts/2;
6379
6380   // Count the number of UNDEF operands in the build_vector in input.
6381   for (unsigned i = 0, e = Half; i != e; ++i)
6382     if (BV->getOperand(i)->isUndef())
6383       NumUndefsLO++;
6384
6385   for (unsigned i = Half, e = NumElts; i != e; ++i)
6386     if (BV->getOperand(i)->isUndef())
6387       NumUndefsHI++;
6388
6389   // Early exit if this is either a build_vector of all UNDEFs or all the
6390   // operands but one are UNDEF.
6391   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6392     return SDValue();
6393
6394   SDLoc DL(BV);
6395   SDValue InVec0, InVec1;
6396   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
6397     // Try to match an SSE3 float HADD/HSUB.
6398     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6399       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6400
6401     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6402       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6403   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
6404     // Try to match an SSSE3 integer HADD/HSUB.
6405     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6406       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6407
6408     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6409       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6410   }
6411
6412   if (!Subtarget.hasAVX())
6413     return SDValue();
6414
6415   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6416     // Try to match an AVX horizontal add/sub of packed single/double
6417     // precision floating point values from 256-bit vectors.
6418     SDValue InVec2, InVec3;
6419     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6420         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6421         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6422         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6423       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6424
6425     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6426         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6427         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6428         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6429       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6430   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6431     // Try to match an AVX2 horizontal add/sub of signed integers.
6432     SDValue InVec2, InVec3;
6433     unsigned X86Opcode;
6434     bool CanFold = true;
6435
6436     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6437         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6438         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6439         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6440       X86Opcode = X86ISD::HADD;
6441     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6442         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6443         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6444         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6445       X86Opcode = X86ISD::HSUB;
6446     else
6447       CanFold = false;
6448
6449     if (CanFold) {
6450       // Fold this build_vector into a single horizontal add/sub.
6451       // Do this only if the target has AVX2.
6452       if (Subtarget.hasAVX2())
6453         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6454
6455       // Do not try to expand this build_vector into a pair of horizontal
6456       // add/sub if we can emit a pair of scalar add/sub.
6457       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6458         return SDValue();
6459
6460       // Convert this build_vector into a pair of horizontal binop followed by
6461       // a concat vector.
6462       bool isUndefLO = NumUndefsLO == Half;
6463       bool isUndefHI = NumUndefsHI == Half;
6464       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6465                                    isUndefLO, isUndefHI);
6466     }
6467   }
6468
6469   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6470        VT == MVT::v16i16) && Subtarget.hasAVX()) {
6471     unsigned X86Opcode;
6472     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6473       X86Opcode = X86ISD::HADD;
6474     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6475       X86Opcode = X86ISD::HSUB;
6476     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6477       X86Opcode = X86ISD::FHADD;
6478     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6479       X86Opcode = X86ISD::FHSUB;
6480     else
6481       return SDValue();
6482
6483     // Don't try to expand this build_vector into a pair of horizontal add/sub
6484     // if we can simply emit a pair of scalar add/sub.
6485     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6486       return SDValue();
6487
6488     // Convert this build_vector into two horizontal add/sub followed by
6489     // a concat vector.
6490     bool isUndefLO = NumUndefsLO == Half;
6491     bool isUndefHI = NumUndefsHI == Half;
6492     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6493                                  isUndefLO, isUndefHI);
6494   }
6495
6496   return SDValue();
6497 }
6498
6499 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
6500 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
6501 /// just apply the bit to the vectors.
6502 /// NOTE: Its not in our interest to start make a general purpose vectorizer
6503 /// from this, but enough scalar bit operations are created from the later
6504 /// legalization + scalarization stages to need basic support.
6505 static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
6506   SDLoc DL(Op);
6507   MVT VT = Op.getSimpleValueType();
6508   unsigned NumElems = VT.getVectorNumElements();
6509   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6510
6511   // Check that all elements have the same opcode.
6512   // TODO: Should we allow UNDEFS and if so how many?
6513   unsigned Opcode = Op.getOperand(0).getOpcode();
6514   for (unsigned i = 1; i < NumElems; ++i)
6515     if (Opcode != Op.getOperand(i).getOpcode())
6516       return SDValue();
6517
6518   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
6519   switch (Opcode) {
6520   default:
6521     return SDValue();
6522   case ISD::AND:
6523   case ISD::XOR:
6524   case ISD::OR:
6525     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
6526       return SDValue();
6527     break;
6528   }
6529
6530   SmallVector<SDValue, 4> LHSElts, RHSElts;
6531   for (SDValue Elt : Op->ops()) {
6532     SDValue LHS = Elt.getOperand(0);
6533     SDValue RHS = Elt.getOperand(1);
6534
6535     // We expect the canonicalized RHS operand to be the constant.
6536     if (!isa<ConstantSDNode>(RHS))
6537       return SDValue();
6538     LHSElts.push_back(LHS);
6539     RHSElts.push_back(RHS);
6540   }
6541
6542   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
6543   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
6544   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
6545 }
6546
6547 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
6548 /// functionality to do this, so it's all zeros, all ones, or some derivation
6549 /// that is cheap to calculate.
6550 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
6551                                          const X86Subtarget &Subtarget) {
6552   SDLoc DL(Op);
6553   MVT VT = Op.getSimpleValueType();
6554
6555   // Vectors containing all zeros can be matched by pxor and xorps.
6556   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6557     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6558     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6559     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6560       return Op;
6561
6562     return getZeroVector(VT, Subtarget, DAG, DL);
6563   }
6564
6565   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6566   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6567   // vpcmpeqd on 256-bit vectors.
6568   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6569     if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
6570         (VT == MVT::v8i32 && Subtarget.hasInt256()))
6571       return Op;
6572
6573     return getOnesVector(VT, Subtarget, DAG, DL);
6574   }
6575
6576   return SDValue();
6577 }
6578
6579 SDValue
6580 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6581   SDLoc dl(Op);
6582
6583   MVT VT = Op.getSimpleValueType();
6584   MVT ExtVT = VT.getVectorElementType();
6585   unsigned NumElems = Op.getNumOperands();
6586
6587   // Generate vectors for predicate vectors.
6588   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
6589     return LowerBUILD_VECTORvXi1(Op, DAG);
6590
6591   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
6592     return VectorConstant;
6593
6594   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
6595   if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
6596     return AddSub;
6597   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
6598     return HorizontalOp;
6599   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
6600     return Broadcast;
6601   if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
6602     return BitOp;
6603
6604   unsigned EVTBits = ExtVT.getSizeInBits();
6605
6606   unsigned NumZero  = 0;
6607   unsigned NumNonZero = 0;
6608   uint64_t NonZeros = 0;
6609   bool IsAllConstants = true;
6610   SmallSet<SDValue, 8> Values;
6611   for (unsigned i = 0; i < NumElems; ++i) {
6612     SDValue Elt = Op.getOperand(i);
6613     if (Elt.isUndef())
6614       continue;
6615     Values.insert(Elt);
6616     if (Elt.getOpcode() != ISD::Constant &&
6617         Elt.getOpcode() != ISD::ConstantFP)
6618       IsAllConstants = false;
6619     if (X86::isZeroNode(Elt))
6620       NumZero++;
6621     else {
6622       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
6623       NonZeros |= ((uint64_t)1 << i);
6624       NumNonZero++;
6625     }
6626   }
6627
6628   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6629   if (NumNonZero == 0)
6630     return DAG.getUNDEF(VT);
6631
6632   // Special case for single non-zero, non-undef, element.
6633   if (NumNonZero == 1) {
6634     unsigned Idx = countTrailingZeros(NonZeros);
6635     SDValue Item = Op.getOperand(Idx);
6636
6637     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6638     // the value are obviously zero, truncate the value to i32 and do the
6639     // insertion that way.  Only do this if the value is non-constant or if the
6640     // value is a constant being inserted into element 0.  It is cheaper to do
6641     // a constant pool load than it is to do a movd + shuffle.
6642     if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
6643         (!IsAllConstants || Idx == 0)) {
6644       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6645         // Handle SSE only.
6646         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6647         MVT VecVT = MVT::v4i32;
6648
6649         // Truncate the value (which may itself be a constant) to i32, and
6650         // convert it to a vector with movd (S2V+shuffle to zero extend).
6651         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6652         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6653         return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
6654                                       Item, Idx * 2, true, Subtarget, DAG));
6655       }
6656     }
6657
6658     // If we have a constant or non-constant insertion into the low element of
6659     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6660     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6661     // depending on what the source datatype is.
6662     if (Idx == 0) {
6663       if (NumZero == 0)
6664         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6665
6666       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6667           (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
6668         if (VT.is512BitVector()) {
6669           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6670           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6671                              Item, DAG.getIntPtrConstant(0, dl));
6672         }
6673         assert((VT.is128BitVector() || VT.is256BitVector()) &&
6674                "Expected an SSE value type!");
6675         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6676         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6677         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6678       }
6679
6680       // We can't directly insert an i8 or i16 into a vector, so zero extend
6681       // it to i32 first.
6682       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6683         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
6684         if (VT.getSizeInBits() >= 256) {
6685           MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
6686           if (Subtarget.hasAVX()) {
6687             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
6688             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6689           } else {
6690             // Without AVX, we need to extend to a 128-bit vector and then
6691             // insert into the 256-bit vector.
6692             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6693             SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
6694             Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
6695           }
6696         } else {
6697           assert(VT.is128BitVector() && "Expected an SSE value type!");
6698           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6699           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6700         }
6701         return DAG.getBitcast(VT, Item);
6702       }
6703     }
6704
6705     // Is it a vector logical left shift?
6706     if (NumElems == 2 && Idx == 1 &&
6707         X86::isZeroNode(Op.getOperand(0)) &&
6708         !X86::isZeroNode(Op.getOperand(1))) {
6709       unsigned NumBits = VT.getSizeInBits();
6710       return getVShift(true, VT,
6711                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6712                                    VT, Op.getOperand(1)),
6713                        NumBits/2, DAG, *this, dl);
6714     }
6715
6716     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
6717       return SDValue();
6718
6719     // Otherwise, if this is a vector with i32 or f32 elements, and the element
6720     // is a non-constant being inserted into an element other than the low one,
6721     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
6722     // movd/movss) to move this into the low element, then shuffle it into
6723     // place.
6724     if (EVTBits == 32) {
6725       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6726       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
6727     }
6728   }
6729
6730   // Splat is obviously ok. Let legalizer expand it to a shuffle.
6731   if (Values.size() == 1) {
6732     if (EVTBits == 32) {
6733       // Instead of a shuffle like this:
6734       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
6735       // Check if it's possible to issue this instead.
6736       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
6737       unsigned Idx = countTrailingZeros(NonZeros);
6738       SDValue Item = Op.getOperand(Idx);
6739       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
6740         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
6741     }
6742     return SDValue();
6743   }
6744
6745   // A vector full of immediates; various special cases are already
6746   // handled, so this is best done with a single constant-pool load.
6747   if (IsAllConstants)
6748     return SDValue();
6749
6750   // See if we can use a vector load to get all of the elements.
6751   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
6752     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6753     if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
6754       return LD;
6755   }
6756
6757   // For AVX-length vectors, build the individual 128-bit pieces and use
6758   // shuffles to put them in place.
6759   if (VT.is256BitVector() || VT.is512BitVector()) {
6760     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6761
6762     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
6763
6764     // Build both the lower and upper subvector.
6765     SDValue Lower =
6766         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
6767     SDValue Upper = DAG.getBuildVector(
6768         HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
6769
6770     // Recreate the wider vector with the lower and upper part.
6771     if (VT.is256BitVector())
6772       return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6773     return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6774   }
6775
6776   // Let legalizer expand 2-wide build_vectors.
6777   if (EVTBits == 64) {
6778     if (NumNonZero == 1) {
6779       // One half is zero or undef.
6780       unsigned Idx = countTrailingZeros(NonZeros);
6781       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
6782                                Op.getOperand(Idx));
6783       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
6784     }
6785     return SDValue();
6786   }
6787
6788   // If element VT is < 32 bits, convert it to inserts into a zero vector.
6789   if (EVTBits == 8 && NumElems == 16)
6790     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
6791                                           DAG, Subtarget, *this))
6792       return V;
6793
6794   if (EVTBits == 16 && NumElems == 8)
6795     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
6796                                           DAG, Subtarget, *this))
6797       return V;
6798
6799   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
6800   if (EVTBits == 32 && NumElems == 4)
6801     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
6802       return V;
6803
6804   // If element VT is == 32 bits, turn it into a number of shuffles.
6805   if (NumElems == 4 && NumZero > 0) {
6806     SmallVector<SDValue, 8> Ops(NumElems);
6807     for (unsigned i = 0; i < 4; ++i) {
6808       bool isZero = !(NonZeros & (1ULL << i));
6809       if (isZero)
6810         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
6811       else
6812         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6813     }
6814
6815     for (unsigned i = 0; i < 2; ++i) {
6816       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
6817         default: break;
6818         case 0:
6819           Ops[i] = Ops[i*2];  // Must be a zero vector.
6820           break;
6821         case 1:
6822           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
6823           break;
6824         case 2:
6825           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6826           break;
6827         case 3:
6828           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6829           break;
6830       }
6831     }
6832
6833     bool Reverse1 = (NonZeros & 0x3) == 2;
6834     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
6835     int MaskVec[] = {
6836       Reverse1 ? 1 : 0,
6837       Reverse1 ? 0 : 1,
6838       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
6839       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
6840     };
6841     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
6842   }
6843
6844   if (Values.size() > 1 && VT.is128BitVector()) {
6845     // Check for a build vector from mostly shuffle plus few inserting.
6846     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
6847       return Sh;
6848
6849     // For SSE 4.1, use insertps to put the high elements into the low element.
6850     if (Subtarget.hasSSE41()) {
6851       SDValue Result;
6852       if (!Op.getOperand(0).isUndef())
6853         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
6854       else
6855         Result = DAG.getUNDEF(VT);
6856
6857       for (unsigned i = 1; i < NumElems; ++i) {
6858         if (Op.getOperand(i).isUndef()) continue;
6859         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
6860                              Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6861       }
6862       return Result;
6863     }
6864
6865     // Otherwise, expand into a number of unpckl*, start by extending each of
6866     // our (non-undef) elements to the full vector width with the element in the
6867     // bottom slot of the vector (which generates no code for SSE).
6868     SmallVector<SDValue, 8> Ops(NumElems);
6869     for (unsigned i = 0; i < NumElems; ++i) {
6870       if (!Op.getOperand(i).isUndef())
6871         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6872       else
6873         Ops[i] = DAG.getUNDEF(VT);
6874     }
6875
6876     // Next, we iteratively mix elements, e.g. for v4f32:
6877     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
6878     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
6879     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
6880     unsigned EltStride = NumElems >> 1;
6881     while (EltStride != 0) {
6882       for (unsigned i = 0; i < EltStride; ++i) {
6883         // If Ops[i+EltStride] is undef and this is the first round of mixing,
6884         // then it is safe to just drop this shuffle: V[i] is already in the
6885         // right place, the one element (since it's the first round) being
6886         // inserted as undef can be dropped.  This isn't safe for successive
6887         // rounds because they will permute elements within both vectors.
6888         if (Ops[i+EltStride].isUndef() &&
6889             EltStride == NumElems/2)
6890           continue;
6891
6892         Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
6893       }
6894       EltStride >>= 1;
6895     }
6896     return Ops[0];
6897   }
6898   return SDValue();
6899 }
6900
6901 // 256-bit AVX can use the vinsertf128 instruction
6902 // to create 256-bit vectors from two other 128-bit ones.
6903 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6904   SDLoc dl(Op);
6905   MVT ResVT = Op.getSimpleValueType();
6906
6907   assert((ResVT.is256BitVector() ||
6908           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
6909
6910   SDValue V1 = Op.getOperand(0);
6911   SDValue V2 = Op.getOperand(1);
6912   unsigned NumElems = ResVT.getVectorNumElements();
6913   if (ResVT.is256BitVector())
6914     return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6915
6916   if (Op.getNumOperands() == 4) {
6917     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6918                                   ResVT.getVectorNumElements()/2);
6919     SDValue V3 = Op.getOperand(2);
6920     SDValue V4 = Op.getOperand(3);
6921     return concat256BitVectors(
6922         concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
6923         concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
6924         NumElems, DAG, dl);
6925   }
6926   return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6927 }
6928
6929 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
6930                                        const X86Subtarget &Subtarget,
6931                                        SelectionDAG & DAG) {
6932   SDLoc dl(Op);
6933   MVT ResVT = Op.getSimpleValueType();
6934   unsigned NumOfOperands = Op.getNumOperands();
6935
6936   assert(isPowerOf2_32(NumOfOperands) &&
6937          "Unexpected number of operands in CONCAT_VECTORS");
6938
6939   SDValue Undef = DAG.getUNDEF(ResVT);
6940   if (NumOfOperands > 2) {
6941     // Specialize the cases when all, or all but one, of the operands are undef.
6942     unsigned NumOfDefinedOps = 0;
6943     unsigned OpIdx = 0;
6944     for (unsigned i = 0; i < NumOfOperands; i++)
6945       if (!Op.getOperand(i).isUndef()) {
6946         NumOfDefinedOps++;
6947         OpIdx = i;
6948       }
6949     if (NumOfDefinedOps == 0)
6950       return Undef;
6951     if (NumOfDefinedOps == 1) {
6952       unsigned SubVecNumElts =
6953         Op.getOperand(OpIdx).getValueType().getVectorNumElements();
6954       SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
6955       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
6956                          Op.getOperand(OpIdx), IdxVal);
6957     }
6958
6959     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6960                                   ResVT.getVectorNumElements()/2);
6961     SmallVector<SDValue, 2> Ops;
6962     for (unsigned i = 0; i < NumOfOperands/2; i++)
6963       Ops.push_back(Op.getOperand(i));
6964     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6965     Ops.clear();
6966     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
6967       Ops.push_back(Op.getOperand(i));
6968     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6969     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
6970   }
6971
6972   // 2 operands
6973   SDValue V1 = Op.getOperand(0);
6974   SDValue V2 = Op.getOperand(1);
6975   unsigned NumElems = ResVT.getVectorNumElements();
6976   assert(V1.getValueType() == V2.getValueType() &&
6977          V1.getValueType().getVectorNumElements() == NumElems/2 &&
6978          "Unexpected operands in CONCAT_VECTORS");
6979
6980   if (ResVT.getSizeInBits() >= 16)
6981     return Op; // The operation is legal with KUNPCK
6982
6983   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
6984   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
6985   SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
6986   if (IsZeroV1 && IsZeroV2)
6987     return ZeroVec;
6988
6989   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6990   if (V2.isUndef())
6991     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
6992   if (IsZeroV2)
6993     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
6994
6995   SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
6996   if (V1.isUndef())
6997     V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
6998
6999   if (IsZeroV1)
7000     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
7001
7002   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7003   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
7004 }
7005
7006 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7007                                    const X86Subtarget &Subtarget,
7008                                    SelectionDAG &DAG) {
7009   MVT VT = Op.getSimpleValueType();
7010   if (VT.getVectorElementType() == MVT::i1)
7011     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7012
7013   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7014          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7015           Op.getNumOperands() == 4)));
7016
7017   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7018   // from two other 128-bit ones.
7019
7020   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7021   return LowerAVXCONCAT_VECTORS(Op, DAG);
7022 }
7023
7024 //===----------------------------------------------------------------------===//
7025 // Vector shuffle lowering
7026 //
7027 // This is an experimental code path for lowering vector shuffles on x86. It is
7028 // designed to handle arbitrary vector shuffles and blends, gracefully
7029 // degrading performance as necessary. It works hard to recognize idiomatic
7030 // shuffles and lower them to optimal instruction patterns without leaving
7031 // a framework that allows reasonably efficient handling of all vector shuffle
7032 // patterns.
7033 //===----------------------------------------------------------------------===//
7034
7035 /// \brief Tiny helper function to identify a no-op mask.
7036 ///
7037 /// This is a somewhat boring predicate function. It checks whether the mask
7038 /// array input, which is assumed to be a single-input shuffle mask of the kind
7039 /// used by the X86 shuffle instructions (not a fully general
7040 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7041 /// in-place shuffle are 'no-op's.
7042 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7043   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7044     assert(Mask[i] >= -1 && "Out of bound mask element!");
7045     if (Mask[i] >= 0 && Mask[i] != i)
7046       return false;
7047   }
7048   return true;
7049 }
7050
7051 /// \brief Test whether there are elements crossing 128-bit lanes in this
7052 /// shuffle mask.
7053 ///
7054 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7055 /// and we routinely test for these.
7056 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7057   int LaneSize = 128 / VT.getScalarSizeInBits();
7058   int Size = Mask.size();
7059   for (int i = 0; i < Size; ++i)
7060     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7061       return true;
7062   return false;
7063 }
7064
7065 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7066 ///
7067 /// This checks a shuffle mask to see if it is performing the same
7068 /// lane-relative shuffle in each sub-lane. This trivially implies
7069 /// that it is also not lane-crossing. It may however involve a blend from the
7070 /// same lane of a second vector.
7071 ///
7072 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7073 /// non-trivial to compute in the face of undef lanes. The representation is
7074 /// suitable for use with existing 128-bit shuffles as entries from the second
7075 /// vector have been remapped to [LaneSize, 2*LaneSize).
7076 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7077                                   ArrayRef<int> Mask,
7078                                   SmallVectorImpl<int> &RepeatedMask) {
7079   int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7080   RepeatedMask.assign(LaneSize, -1);
7081   int Size = Mask.size();
7082   for (int i = 0; i < Size; ++i) {
7083     if (Mask[i] < 0)
7084       continue;
7085     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7086       // This entry crosses lanes, so there is no way to model this shuffle.
7087       return false;
7088
7089     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7090     // Adjust second vector indices to start at LaneSize instead of Size.
7091     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7092                                 : Mask[i] % LaneSize + LaneSize;
7093     if (RepeatedMask[i % LaneSize] < 0)
7094       // This is the first non-undef entry in this slot of a 128-bit lane.
7095       RepeatedMask[i % LaneSize] = LocalM;
7096     else if (RepeatedMask[i % LaneSize] != LocalM)
7097       // Found a mismatch with the repeated mask.
7098       return false;
7099   }
7100   return true;
7101 }
7102
7103 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
7104 static bool
7105 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7106                                 SmallVectorImpl<int> &RepeatedMask) {
7107   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7108 }
7109
7110 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
7111 static bool
7112 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7113                                 SmallVectorImpl<int> &RepeatedMask) {
7114   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
7115 }
7116
7117 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
7118                              SmallVectorImpl<int> &ScaledMask) {
7119   assert(0 < Scale && "Unexpected scaling factor");
7120   int NumElts = Mask.size();
7121   ScaledMask.assign(NumElts * Scale, -1);
7122
7123   for (int i = 0; i != NumElts; ++i) {
7124     int M = Mask[i];
7125
7126     // Repeat sentinel values in every mask element.
7127     if (M < 0) {
7128       for (int s = 0; s != Scale; ++s)
7129         ScaledMask[(Scale * i) + s] = M;
7130       continue;
7131     }
7132
7133     // Scale mask element and increment across each mask element.
7134     for (int s = 0; s != Scale; ++s)
7135       ScaledMask[(Scale * i) + s] = (Scale * M) + s;
7136   }
7137 }
7138
7139 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7140 /// arguments.
7141 ///
7142 /// This is a fast way to test a shuffle mask against a fixed pattern:
7143 ///
7144 ///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
7145 ///
7146 /// It returns true if the mask is exactly as wide as the argument list, and
7147 /// each element of the mask is either -1 (signifying undef) or the value given
7148 /// in the argument.
7149 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7150                                 ArrayRef<int> ExpectedMask) {
7151   if (Mask.size() != ExpectedMask.size())
7152     return false;
7153
7154   int Size = Mask.size();
7155
7156   // If the values are build vectors, we can look through them to find
7157   // equivalent inputs that make the shuffles equivalent.
7158   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7159   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7160
7161   for (int i = 0; i < Size; ++i) {
7162     assert(Mask[i] >= -1 && "Out of bound mask element!");
7163     if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
7164       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7165       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7166       if (!MaskBV || !ExpectedBV ||
7167           MaskBV->getOperand(Mask[i] % Size) !=
7168               ExpectedBV->getOperand(ExpectedMask[i] % Size))
7169         return false;
7170     }
7171 }
7172
7173   return true;
7174 }
7175
7176 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7177 ///
7178 /// The masks must be exactly the same width.
7179 ///
7180 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7181 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
7182 ///
7183 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
7184 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7185                                       ArrayRef<int> ExpectedMask) {
7186   int Size = Mask.size();
7187   if (Size != (int)ExpectedMask.size())
7188     return false;
7189
7190   for (int i = 0; i < Size; ++i)
7191     if (Mask[i] == SM_SentinelUndef)
7192       continue;
7193     else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7194       return false;
7195     else if (Mask[i] != ExpectedMask[i])
7196       return false;
7197
7198   return true;
7199 }
7200
7201 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7202 ///
7203 /// This helper function produces an 8-bit shuffle immediate corresponding to
7204 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7205 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7206 /// example.
7207 ///
7208 /// NB: We rely heavily on "undef" masks preserving the input lane.
7209 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7210   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7211   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7212   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7213   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7214   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7215
7216   unsigned Imm = 0;
7217   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
7218   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
7219   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
7220   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
7221   return Imm;
7222 }
7223
7224 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
7225                                           SelectionDAG &DAG) {
7226   return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
7227 }
7228
7229 /// \brief Compute whether each element of a shuffle is zeroable.
7230 ///
7231 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7232 /// Either it is an undef element in the shuffle mask, the element of the input
7233 /// referenced is undef, or the element of the input referenced is known to be
7234 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7235 /// as many lanes with this technique as possible to simplify the remaining
7236 /// shuffle.
7237 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7238                                                      SDValue V1, SDValue V2) {
7239   SmallBitVector Zeroable(Mask.size(), false);
7240   V1 = peekThroughBitcasts(V1);
7241   V2 = peekThroughBitcasts(V2);
7242
7243   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7244   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7245
7246   int VectorSizeInBits = V1.getValueType().getSizeInBits();
7247   int ScalarSizeInBits = VectorSizeInBits / Mask.size();
7248   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7249
7250   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7251     int M = Mask[i];
7252     // Handle the easy cases.
7253     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7254       Zeroable[i] = true;
7255       continue;
7256     }
7257
7258     // Determine shuffle input and normalize the mask.
7259     SDValue V = M < Size ? V1 : V2;
7260     M %= Size;
7261
7262     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7263     if (V.getOpcode() != ISD::BUILD_VECTOR)
7264       continue;
7265
7266     // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
7267     // the (larger) source element must be UNDEF/ZERO.
7268     if ((Size % V.getNumOperands()) == 0) {
7269       int Scale = Size / V->getNumOperands();
7270       SDValue Op = V.getOperand(M / Scale);
7271       if (Op.isUndef() || X86::isZeroNode(Op))
7272         Zeroable[i] = true;
7273       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7274         APInt Val = Cst->getAPIntValue();
7275         Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7276         Val = Val.getLoBits(ScalarSizeInBits);
7277         Zeroable[i] = (Val == 0);
7278       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7279         APInt Val = Cst->getValueAPF().bitcastToAPInt();
7280         Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7281         Val = Val.getLoBits(ScalarSizeInBits);
7282         Zeroable[i] = (Val == 0);
7283       }
7284       continue;
7285     }
7286
7287     // If the BUILD_VECTOR has more elements then all the (smaller) source
7288     // elements must be UNDEF or ZERO.
7289     if ((V.getNumOperands() % Size) == 0) {
7290       int Scale = V->getNumOperands() / Size;
7291       bool AllZeroable = true;
7292       for (int j = 0; j < Scale; ++j) {
7293         SDValue Op = V.getOperand((M * Scale) + j);
7294         AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
7295       }
7296       Zeroable[i] = AllZeroable;
7297       continue;
7298     }
7299   }
7300
7301   return Zeroable;
7302 }
7303
7304 /// Try to lower a shuffle with a single PSHUFB of V1.
7305 /// This is only possible if V2 is unused (at all, or only for zero elements).
7306 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
7307                                             ArrayRef<int> Mask, SDValue V1,
7308                                             SDValue V2,
7309                                             const X86Subtarget &Subtarget,
7310                                             SelectionDAG &DAG) {
7311   int Size = Mask.size();
7312   int LaneSize = 128 / VT.getScalarSizeInBits();
7313   const int NumBytes = VT.getSizeInBits() / 8;
7314   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
7315
7316   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
7317          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
7318          (Subtarget.hasBWI() && VT.is512BitVector()));
7319
7320   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7321
7322   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
7323   // Sign bit set in i8 mask means zero element.
7324   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
7325
7326   for (int i = 0; i < NumBytes; ++i) {
7327     int M = Mask[i / NumEltBytes];
7328     if (M < 0) {
7329       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
7330       continue;
7331     }
7332     if (Zeroable[i / NumEltBytes]) {
7333       PSHUFBMask[i] = ZeroMask;
7334       continue;
7335     }
7336     // Only allow V1.
7337     if (M >= Size)
7338       return SDValue();
7339
7340     // PSHUFB can't cross lanes, ensure this doesn't happen.
7341     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
7342       return SDValue();
7343
7344     M = M % LaneSize;
7345     M = M * NumEltBytes + (i % NumEltBytes);
7346     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
7347   }
7348
7349   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
7350   return DAG.getBitcast(
7351       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
7352                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
7353 }
7354
7355 // X86 has dedicated unpack instructions that can handle specific blend
7356 // operations: UNPCKH and UNPCKL.
7357 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
7358                                            ArrayRef<int> Mask, SDValue V1,
7359                                            SDValue V2, SelectionDAG &DAG) {
7360   int NumElts = VT.getVectorNumElements();
7361   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7362   SmallVector<int, 8> Unpckl(NumElts);
7363   SmallVector<int, 8> Unpckh(NumElts);
7364
7365   for (int i = 0; i < NumElts; ++i) {
7366     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7367     int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
7368     int HiPos = LoPos + NumEltsInLane / 2;
7369     Unpckl[i] = LoPos;
7370     Unpckh[i] = HiPos;
7371   }
7372
7373   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7374     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
7375   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7376     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
7377
7378   // Commute and try again.
7379   ShuffleVectorSDNode::commuteMask(Unpckl);
7380   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7381     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
7382
7383   ShuffleVectorSDNode::commuteMask(Unpckh);
7384   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7385     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
7386
7387   return SDValue();
7388 }
7389
7390 /// \brief Try to emit a bitmask instruction for a shuffle.
7391 ///
7392 /// This handles cases where we can model a blend exactly as a bitmask due to
7393 /// one of the inputs being zeroable.
7394 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
7395                                            SDValue V2, ArrayRef<int> Mask,
7396                                            SelectionDAG &DAG) {
7397   MVT EltVT = VT.getVectorElementType();
7398   int NumEltBits = EltVT.getSizeInBits();
7399   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7400   SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
7401   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7402                                     IntEltVT);
7403   if (EltVT.isFloatingPoint()) {
7404     Zero = DAG.getBitcast(EltVT, Zero);
7405     AllOnes = DAG.getBitcast(EltVT, AllOnes);
7406   }
7407   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7408   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7409   SDValue V;
7410   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7411     if (Zeroable[i])
7412       continue;
7413     if (Mask[i] % Size != i)
7414       return SDValue(); // Not a blend.
7415     if (!V)
7416       V = Mask[i] < Size ? V1 : V2;
7417     else if (V != (Mask[i] < Size ? V1 : V2))
7418       return SDValue(); // Can only let one input through the mask.
7419
7420     VMaskOps[i] = AllOnes;
7421   }
7422   if (!V)
7423     return SDValue(); // No non-zeroable elements!
7424
7425   SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
7426   V = DAG.getNode(VT.isFloatingPoint()
7427                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7428                   DL, VT, V, VMask);
7429   return V;
7430 }
7431
7432 /// \brief Try to emit a blend instruction for a shuffle using bit math.
7433 ///
7434 /// This is used as a fallback approach when first class blend instructions are
7435 /// unavailable. Currently it is only suitable for integer vectors, but could
7436 /// be generalized for floating point vectors if desirable.
7437 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
7438                                             SDValue V2, ArrayRef<int> Mask,
7439                                             SelectionDAG &DAG) {
7440   assert(VT.isInteger() && "Only supports integer vector types!");
7441   MVT EltVT = VT.getVectorElementType();
7442   int NumEltBits = EltVT.getSizeInBits();
7443   SDValue Zero = DAG.getConstant(0, DL, EltVT);
7444   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7445                                     EltVT);
7446   SmallVector<SDValue, 16> MaskOps;
7447   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7448     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
7449       return SDValue(); // Shuffled input!
7450     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
7451   }
7452
7453   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
7454   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
7455   // We have to cast V2 around.
7456   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
7457   V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
7458                                       DAG.getBitcast(MaskVT, V1Mask),
7459                                       DAG.getBitcast(MaskVT, V2)));
7460   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
7461 }
7462
7463 /// \brief Try to emit a blend instruction for a shuffle.
7464 ///
7465 /// This doesn't do any checks for the availability of instructions for blending
7466 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7467 /// be matched in the backend with the type given. What it does check for is
7468 /// that the shuffle mask is a blend, or convertible into a blend with zero.
7469 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
7470                                          SDValue V2, ArrayRef<int> Original,
7471                                          const X86Subtarget &Subtarget,
7472                                          SelectionDAG &DAG) {
7473   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7474   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7475   SmallVector<int, 8> Mask(Original.begin(), Original.end());
7476   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7477   bool ForceV1Zero = false, ForceV2Zero = false;
7478
7479   // Attempt to generate the binary blend mask. If an input is zero then
7480   // we can use any lane.
7481   // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
7482   unsigned BlendMask = 0;
7483   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7484     int M = Mask[i];
7485     if (M < 0)
7486       continue;
7487     if (M == i)
7488       continue;
7489     if (M == i + Size) {
7490       BlendMask |= 1u << i;
7491       continue;
7492     }
7493     if (Zeroable[i]) {
7494       if (V1IsZero) {
7495         ForceV1Zero = true;
7496         Mask[i] = i;
7497         continue;
7498       }
7499       if (V2IsZero) {
7500         ForceV2Zero = true;
7501         BlendMask |= 1u << i;
7502         Mask[i] = i + Size;
7503         continue;
7504       }
7505     }
7506     return SDValue(); // Shuffled input!
7507   }
7508
7509   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
7510   if (ForceV1Zero)
7511     V1 = getZeroVector(VT, Subtarget, DAG, DL);
7512   if (ForceV2Zero)
7513     V2 = getZeroVector(VT, Subtarget, DAG, DL);
7514
7515   auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
7516     unsigned ScaledMask = 0;
7517     for (int i = 0; i != Size; ++i)
7518       if (BlendMask & (1u << i))
7519         for (int j = 0; j != Scale; ++j)
7520           ScaledMask |= 1u << (i * Scale + j);
7521     return ScaledMask;
7522   };
7523
7524   switch (VT.SimpleTy) {
7525   case MVT::v2f64:
7526   case MVT::v4f32:
7527   case MVT::v4f64:
7528   case MVT::v8f32:
7529     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7530                        DAG.getConstant(BlendMask, DL, MVT::i8));
7531
7532   case MVT::v4i64:
7533   case MVT::v8i32:
7534     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7535     // FALLTHROUGH
7536   case MVT::v2i64:
7537   case MVT::v4i32:
7538     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7539     // that instruction.
7540     if (Subtarget.hasAVX2()) {
7541       // Scale the blend by the number of 32-bit dwords per element.
7542       int Scale =  VT.getScalarSizeInBits() / 32;
7543       BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7544       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7545       V1 = DAG.getBitcast(BlendVT, V1);
7546       V2 = DAG.getBitcast(BlendVT, V2);
7547       return DAG.getBitcast(
7548           VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7549                           DAG.getConstant(BlendMask, DL, MVT::i8)));
7550     }
7551     // FALLTHROUGH
7552   case MVT::v8i16: {
7553     // For integer shuffles we need to expand the mask and cast the inputs to
7554     // v8i16s prior to blending.
7555     int Scale = 8 / VT.getVectorNumElements();
7556     BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7557     V1 = DAG.getBitcast(MVT::v8i16, V1);
7558     V2 = DAG.getBitcast(MVT::v8i16, V2);
7559     return DAG.getBitcast(VT,
7560                           DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7561                                       DAG.getConstant(BlendMask, DL, MVT::i8)));
7562   }
7563
7564   case MVT::v16i16: {
7565     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7566     SmallVector<int, 8> RepeatedMask;
7567     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7568       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7569       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7570       BlendMask = 0;
7571       for (int i = 0; i < 8; ++i)
7572         if (RepeatedMask[i] >= 8)
7573           BlendMask |= 1u << i;
7574       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7575                          DAG.getConstant(BlendMask, DL, MVT::i8));
7576     }
7577   }
7578     // FALLTHROUGH
7579   case MVT::v16i8:
7580   case MVT::v32i8: {
7581     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
7582            "256-bit byte-blends require AVX2 support!");
7583
7584     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
7585     if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
7586       return Masked;
7587
7588     // Scale the blend by the number of bytes per element.
7589     int Scale = VT.getScalarSizeInBits() / 8;
7590
7591     // This form of blend is always done on bytes. Compute the byte vector
7592     // type.
7593     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
7594
7595     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7596     // mix of LLVM's code generator and the x86 backend. We tell the code
7597     // generator that boolean values in the elements of an x86 vector register
7598     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7599     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7600     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7601     // of the element (the remaining are ignored) and 0 in that high bit would
7602     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7603     // the LLVM model for boolean values in vector elements gets the relevant
7604     // bit set, it is set backwards and over constrained relative to x86's
7605     // actual model.
7606     SmallVector<SDValue, 32> VSELECTMask;
7607     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7608       for (int j = 0; j < Scale; ++j)
7609         VSELECTMask.push_back(
7610             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7611                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
7612                                           MVT::i8));
7613
7614     V1 = DAG.getBitcast(BlendVT, V1);
7615     V2 = DAG.getBitcast(BlendVT, V2);
7616     return DAG.getBitcast(
7617         VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
7618                         DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
7619   }
7620
7621   default:
7622     llvm_unreachable("Not a supported integer vector type!");
7623   }
7624 }
7625
7626 /// \brief Try to lower as a blend of elements from two inputs followed by
7627 /// a single-input permutation.
7628 ///
7629 /// This matches the pattern where we can blend elements from two inputs and
7630 /// then reduce the shuffle to a single-input permutation.
7631 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
7632                                                    SDValue V1, SDValue V2,
7633                                                    ArrayRef<int> Mask,
7634                                                    SelectionDAG &DAG) {
7635   // We build up the blend mask while checking whether a blend is a viable way
7636   // to reduce the shuffle.
7637   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7638   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
7639
7640   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7641     if (Mask[i] < 0)
7642       continue;
7643
7644     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7645
7646     if (BlendMask[Mask[i] % Size] < 0)
7647       BlendMask[Mask[i] % Size] = Mask[i];
7648     else if (BlendMask[Mask[i] % Size] != Mask[i])
7649       return SDValue(); // Can't blend in the needed input!
7650
7651     PermuteMask[i] = Mask[i] % Size;
7652   }
7653
7654   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7655   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
7656 }
7657
7658 /// \brief Generic routine to decompose a shuffle and blend into indepndent
7659 /// blends and permutes.
7660 ///
7661 /// This matches the extremely common pattern for handling combined
7662 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7663 /// operations. It will try to pick the best arrangement of shuffles and
7664 /// blends.
7665 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
7666                                                           MVT VT, SDValue V1,
7667                                                           SDValue V2,
7668                                                           ArrayRef<int> Mask,
7669                                                           SelectionDAG &DAG) {
7670   // Shuffle the input elements into the desired positions in V1 and V2 and
7671   // blend them together.
7672   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7673   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7674   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7675   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7676     if (Mask[i] >= 0 && Mask[i] < Size) {
7677       V1Mask[i] = Mask[i];
7678       BlendMask[i] = i;
7679     } else if (Mask[i] >= Size) {
7680       V2Mask[i] = Mask[i] - Size;
7681       BlendMask[i] = i + Size;
7682     }
7683
7684   // Try to lower with the simpler initial blend strategy unless one of the
7685   // input shuffles would be a no-op. We prefer to shuffle inputs as the
7686   // shuffle may be able to fold with a load or other benefit. However, when
7687   // we'll have to do 2x as many shuffles in order to achieve this, blending
7688   // first is a better strategy.
7689   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7690     if (SDValue BlendPerm =
7691             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
7692       return BlendPerm;
7693
7694   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7695   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7696   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7697 }
7698
7699 /// \brief Try to lower a vector shuffle as a byte rotation.
7700 ///
7701 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7702 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7703 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7704 /// try to generically lower a vector shuffle through such an pattern. It
7705 /// does not check for the profitability of lowering either as PALIGNR or
7706 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7707 /// This matches shuffle vectors that look like:
7708 ///
7709 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7710 ///
7711 /// Essentially it concatenates V1 and V2, shifts right by some number of
7712 /// elements, and takes the low elements as the result. Note that while this is
7713 /// specified as a *right shift* because x86 is little-endian, it is a *left
7714 /// rotate* of the vector lanes.
7715 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
7716                                               SDValue V1, SDValue V2,
7717                                               ArrayRef<int> Mask,
7718                                               const X86Subtarget &Subtarget,
7719                                               SelectionDAG &DAG) {
7720   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7721
7722   int NumElts = Mask.size();
7723   int NumLanes = VT.getSizeInBits() / 128;
7724   int NumLaneElts = NumElts / NumLanes;
7725
7726   // We need to detect various ways of spelling a rotation:
7727   //   [11, 12, 13, 14, 15,  0,  1,  2]
7728   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7729   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7730   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7731   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7732   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7733   int Rotation = 0;
7734   SDValue Lo, Hi;
7735   for (int l = 0; l < NumElts; l += NumLaneElts) {
7736     for (int i = 0; i < NumLaneElts; ++i) {
7737       if (Mask[l + i] < 0)
7738         continue;
7739
7740       // Get the mod-Size index and lane correct it.
7741       int LaneIdx = (Mask[l + i] % NumElts) - l;
7742       // Make sure it was in this lane.
7743       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
7744         return SDValue();
7745
7746       // Determine where a rotated vector would have started.
7747       int StartIdx = i - LaneIdx;
7748       if (StartIdx == 0)
7749         // The identity rotation isn't interesting, stop.
7750         return SDValue();
7751
7752       // If we found the tail of a vector the rotation must be the missing
7753       // front. If we found the head of a vector, it must be how much of the
7754       // head.
7755       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
7756
7757       if (Rotation == 0)
7758         Rotation = CandidateRotation;
7759       else if (Rotation != CandidateRotation)
7760         // The rotations don't match, so we can't match this mask.
7761         return SDValue();
7762
7763       // Compute which value this mask is pointing at.
7764       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
7765
7766       // Compute which of the two target values this index should be assigned
7767       // to. This reflects whether the high elements are remaining or the low
7768       // elements are remaining.
7769       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7770
7771       // Either set up this value if we've not encountered it before, or check
7772       // that it remains consistent.
7773       if (!TargetV)
7774         TargetV = MaskV;
7775       else if (TargetV != MaskV)
7776         // This may be a rotation, but it pulls from the inputs in some
7777         // unsupported interleaving.
7778         return SDValue();
7779     }
7780   }
7781
7782   // Check that we successfully analyzed the mask, and normalize the results.
7783   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7784   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7785   if (!Lo)
7786     Lo = Hi;
7787   else if (!Hi)
7788     Hi = Lo;
7789
7790   // Cast the inputs to i8 vector of correct length to match PALIGNR or
7791   // PSLLDQ/PSRLDQ.
7792   MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
7793   Lo = DAG.getBitcast(ByteVT, Lo);
7794   Hi = DAG.getBitcast(ByteVT, Hi);
7795
7796   // The actual rotate instruction rotates bytes, so we need to scale the
7797   // rotation based on how many bytes are in the vector lane.
7798   int Scale = 16 / NumLaneElts;
7799
7800   // SSSE3 targets can use the palignr instruction.
7801   if (Subtarget.hasSSSE3()) {
7802     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
7803            "512-bit PALIGNR requires BWI instructions");
7804     return DAG.getBitcast(
7805         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
7806                         DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
7807   }
7808
7809   assert(VT.is128BitVector() &&
7810          "Rotate-based lowering only supports 128-bit lowering!");
7811   assert(Mask.size() <= 16 &&
7812          "Can shuffle at most 16 bytes in a 128-bit vector!");
7813   assert(ByteVT == MVT::v16i8 &&
7814          "SSE2 rotate lowering only needed for v16i8!");
7815
7816   // Default SSE2 implementation
7817   int LoByteShift = 16 - Rotation * Scale;
7818   int HiByteShift = Rotation * Scale;
7819
7820   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
7821                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
7822   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
7823                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
7824   return DAG.getBitcast(VT,
7825                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
7826 }
7827
7828 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7829 ///
7830 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
7831 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
7832 /// matches elements from one of the input vectors shuffled to the left or
7833 /// right with zeroable elements 'shifted in'. It handles both the strictly
7834 /// bit-wise element shifts and the byte shift across an entire 128-bit double
7835 /// quad word lane.
7836 ///
7837 /// PSHL : (little-endian) left bit shift.
7838 /// [ zz, 0, zz,  2 ]
7839 /// [ -1, 4, zz, -1 ]
7840 /// PSRL : (little-endian) right bit shift.
7841 /// [  1, zz,  3, zz]
7842 /// [ -1, -1,  7, zz]
7843 /// PSLLDQ : (little-endian) left byte shift
7844 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
7845 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
7846 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
7847 /// PSRLDQ : (little-endian) right byte shift
7848 /// [  5, 6,  7, zz, zz, zz, zz, zz]
7849 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
7850 /// [  1, 2, -1, -1, -1, -1, zz, zz]
7851 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
7852                                          SDValue V2, ArrayRef<int> Mask,
7853                                          const X86Subtarget &Subtarget,
7854                                          SelectionDAG &DAG) {
7855   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7856
7857   int Size = Mask.size();
7858   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7859
7860   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
7861     for (int i = 0; i < Size; i += Scale)
7862       for (int j = 0; j < Shift; ++j)
7863         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
7864           return false;
7865
7866     return true;
7867   };
7868
7869   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
7870     for (int i = 0; i != Size; i += Scale) {
7871       unsigned Pos = Left ? i + Shift : i;
7872       unsigned Low = Left ? i : i + Shift;
7873       unsigned Len = Scale - Shift;
7874       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
7875                                       Low + (V == V1 ? 0 : Size)))
7876         return SDValue();
7877     }
7878
7879     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
7880     bool ByteShift = ShiftEltBits > 64;
7881     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
7882                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
7883     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
7884
7885     // Normalize the scale for byte shifts to still produce an i64 element
7886     // type.
7887     Scale = ByteShift ? Scale / 2 : Scale;
7888
7889     // We need to round trip through the appropriate type for the shift.
7890     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7891     MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
7892                             : MVT::getVectorVT(ShiftSVT, Size / Scale);
7893     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7894            "Illegal integer vector type");
7895     V = DAG.getBitcast(ShiftVT, V);
7896
7897     V = DAG.getNode(OpCode, DL, ShiftVT, V,
7898                     DAG.getConstant(ShiftAmt, DL, MVT::i8));
7899     return DAG.getBitcast(VT, V);
7900   };
7901
7902   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7903   // keep doubling the size of the integer elements up to that. We can
7904   // then shift the elements of the integer vector by whole multiples of
7905   // their width within the elements of the larger integer vector. Test each
7906   // multiple to see if we can find a match with the moved element indices
7907   // and that the shifted in elements are all zeroable.
7908   unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
7909   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
7910     for (int Shift = 1; Shift != Scale; ++Shift)
7911       for (bool Left : {true, false})
7912         if (CheckZeros(Shift, Scale, Left))
7913           for (SDValue V : {V1, V2})
7914             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
7915               return Match;
7916
7917   // no match
7918   return SDValue();
7919 }
7920
7921 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
7922 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
7923                                            SDValue V2, ArrayRef<int> Mask,
7924                                            SelectionDAG &DAG) {
7925   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7926   assert(!Zeroable.all() && "Fully zeroable shuffle mask");
7927
7928   int Size = Mask.size();
7929   int HalfSize = Size / 2;
7930   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7931
7932   // Upper half must be undefined.
7933   if (!isUndefInRange(Mask, HalfSize, HalfSize))
7934     return SDValue();
7935
7936   // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
7937   // Remainder of lower half result is zero and upper half is all undef.
7938   auto LowerAsEXTRQ = [&]() {
7939     // Determine the extraction length from the part of the
7940     // lower half that isn't zeroable.
7941     int Len = HalfSize;
7942     for (; Len > 0; --Len)
7943       if (!Zeroable[Len - 1])
7944         break;
7945     assert(Len > 0 && "Zeroable shuffle mask");
7946
7947     // Attempt to match first Len sequential elements from the lower half.
7948     SDValue Src;
7949     int Idx = -1;
7950     for (int i = 0; i != Len; ++i) {
7951       int M = Mask[i];
7952       if (M < 0)
7953         continue;
7954       SDValue &V = (M < Size ? V1 : V2);
7955       M = M % Size;
7956
7957       // The extracted elements must start at a valid index and all mask
7958       // elements must be in the lower half.
7959       if (i > M || M >= HalfSize)
7960         return SDValue();
7961
7962       if (Idx < 0 || (Src == V && Idx == (M - i))) {
7963         Src = V;
7964         Idx = M - i;
7965         continue;
7966       }
7967       return SDValue();
7968     }
7969
7970     if (Idx < 0)
7971       return SDValue();
7972
7973     assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
7974     int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
7975     int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
7976     return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
7977                        DAG.getConstant(BitLen, DL, MVT::i8),
7978                        DAG.getConstant(BitIdx, DL, MVT::i8));
7979   };
7980
7981   if (SDValue ExtrQ = LowerAsEXTRQ())
7982     return ExtrQ;
7983
7984   // INSERTQ: Extract lowest Len elements from lower half of second source and
7985   // insert over first source, starting at Idx.
7986   // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
7987   auto LowerAsInsertQ = [&]() {
7988     for (int Idx = 0; Idx != HalfSize; ++Idx) {
7989       SDValue Base;
7990
7991       // Attempt to match first source from mask before insertion point.
7992       if (isUndefInRange(Mask, 0, Idx)) {
7993         /* EMPTY */
7994       } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
7995         Base = V1;
7996       } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
7997         Base = V2;
7998       } else {
7999         continue;
8000       }
8001
8002       // Extend the extraction length looking to match both the insertion of
8003       // the second source and the remaining elements of the first.
8004       for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8005         SDValue Insert;
8006         int Len = Hi - Idx;
8007
8008         // Match insertion.
8009         if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8010           Insert = V1;
8011         } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8012           Insert = V2;
8013         } else {
8014           continue;
8015         }
8016
8017         // Match the remaining elements of the lower half.
8018         if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8019           /* EMPTY */
8020         } else if ((!Base || (Base == V1)) &&
8021                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8022           Base = V1;
8023         } else if ((!Base || (Base == V2)) &&
8024                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8025                                               Size + Hi)) {
8026           Base = V2;
8027         } else {
8028           continue;
8029         }
8030
8031         // We may not have a base (first source) - this can safely be undefined.
8032         if (!Base)
8033           Base = DAG.getUNDEF(VT);
8034
8035         int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8036         int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8037         return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8038                            DAG.getConstant(BitLen, DL, MVT::i8),
8039                            DAG.getConstant(BitIdx, DL, MVT::i8));
8040       }
8041     }
8042
8043     return SDValue();
8044   };
8045
8046   if (SDValue InsertQ = LowerAsInsertQ())
8047     return InsertQ;
8048
8049   return SDValue();
8050 }
8051
8052 /// \brief Lower a vector shuffle as a zero or any extension.
8053 ///
8054 /// Given a specific number of elements, element bit width, and extension
8055 /// stride, produce either a zero or any extension based on the available
8056 /// features of the subtarget. The extended elements are consecutive and
8057 /// begin and can start from an offseted element index in the input; to
8058 /// avoid excess shuffling the offset must either being in the bottom lane
8059 /// or at the start of a higher lane. All extended elements must be from
8060 /// the same lane.
8061 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8062     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8063     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8064   assert(Scale > 1 && "Need a scale to extend.");
8065   int EltBits = VT.getScalarSizeInBits();
8066   int NumElements = VT.getVectorNumElements();
8067   int NumEltsPerLane = 128 / EltBits;
8068   int OffsetLane = Offset / NumEltsPerLane;
8069   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8070          "Only 8, 16, and 32 bit elements can be extended.");
8071   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8072   assert(0 <= Offset && "Extension offset must be positive.");
8073   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8074          "Extension offset must be in the first lane or start an upper lane.");
8075
8076   // Check that an index is in same lane as the base offset.
8077   auto SafeOffset = [&](int Idx) {
8078     return OffsetLane == (Idx / NumEltsPerLane);
8079   };
8080
8081   // Shift along an input so that the offset base moves to the first element.
8082   auto ShuffleOffset = [&](SDValue V) {
8083     if (!Offset)
8084       return V;
8085
8086     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8087     for (int i = 0; i * Scale < NumElements; ++i) {
8088       int SrcIdx = i + Offset;
8089       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8090     }
8091     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8092   };
8093
8094   // Found a valid zext mask! Try various lowering strategies based on the
8095   // input type and available ISA extensions.
8096   if (Subtarget.hasSSE41()) {
8097     // Not worth offseting 128-bit vectors if scale == 2, a pattern using
8098     // PUNPCK will catch this in a later shuffle match.
8099     if (Offset && Scale == 2 && VT.is128BitVector())
8100       return SDValue();
8101     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
8102                                  NumElements / Scale);
8103     InputV = ShuffleOffset(InputV);
8104
8105     // For 256-bit vectors, we only need the lower (128-bit) input half.
8106     if (VT.is256BitVector())
8107       InputV = extract128BitVector(InputV, 0, DAG, DL);
8108
8109     InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
8110     return DAG.getBitcast(VT, InputV);
8111   }
8112
8113   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
8114
8115   // For any extends we can cheat for larger element sizes and use shuffle
8116   // instructions that can fold with a load and/or copy.
8117   if (AnyExt && EltBits == 32) {
8118     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
8119                          -1};
8120     return DAG.getBitcast(
8121         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8122                         DAG.getBitcast(MVT::v4i32, InputV),
8123                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8124   }
8125   if (AnyExt && EltBits == 16 && Scale > 2) {
8126     int PSHUFDMask[4] = {Offset / 2, -1,
8127                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
8128     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8129                          DAG.getBitcast(MVT::v4i32, InputV),
8130                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
8131     int PSHUFWMask[4] = {1, -1, -1, -1};
8132     unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
8133     return DAG.getBitcast(
8134         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
8135                         DAG.getBitcast(MVT::v8i16, InputV),
8136                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
8137   }
8138
8139   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
8140   // to 64-bits.
8141   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
8142     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
8143     assert(VT.is128BitVector() && "Unexpected vector width!");
8144
8145     int LoIdx = Offset * EltBits;
8146     SDValue Lo = DAG.getBitcast(
8147         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8148                                 DAG.getConstant(EltBits, DL, MVT::i8),
8149                                 DAG.getConstant(LoIdx, DL, MVT::i8)));
8150
8151     if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
8152         !SafeOffset(Offset + 1))
8153       return DAG.getBitcast(VT, Lo);
8154
8155     int HiIdx = (Offset + 1) * EltBits;
8156     SDValue Hi = DAG.getBitcast(
8157         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8158                                 DAG.getConstant(EltBits, DL, MVT::i8),
8159                                 DAG.getConstant(HiIdx, DL, MVT::i8)));
8160     return DAG.getBitcast(VT,
8161                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
8162   }
8163
8164   // If this would require more than 2 unpack instructions to expand, use
8165   // pshufb when available. We can only use more than 2 unpack instructions
8166   // when zero extending i8 elements which also makes it easier to use pshufb.
8167   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
8168     assert(NumElements == 16 && "Unexpected byte vector width!");
8169     SDValue PSHUFBMask[16];
8170     for (int i = 0; i < 16; ++i) {
8171       int Idx = Offset + (i / Scale);
8172       PSHUFBMask[i] = DAG.getConstant(
8173           (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
8174     }
8175     InputV = DAG.getBitcast(MVT::v16i8, InputV);
8176     return DAG.getBitcast(
8177         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
8178                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
8179   }
8180
8181   // If we are extending from an offset, ensure we start on a boundary that
8182   // we can unpack from.
8183   int AlignToUnpack = Offset % (NumElements / Scale);
8184   if (AlignToUnpack) {
8185     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8186     for (int i = AlignToUnpack; i < NumElements; ++i)
8187       ShMask[i - AlignToUnpack] = i;
8188     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
8189     Offset -= AlignToUnpack;
8190   }
8191
8192   // Otherwise emit a sequence of unpacks.
8193   do {
8194     unsigned UnpackLoHi = X86ISD::UNPCKL;
8195     if (Offset >= (NumElements / 2)) {
8196       UnpackLoHi = X86ISD::UNPCKH;
8197       Offset -= (NumElements / 2);
8198     }
8199
8200     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
8201     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
8202                          : getZeroVector(InputVT, Subtarget, DAG, DL);
8203     InputV = DAG.getBitcast(InputVT, InputV);
8204     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
8205     Scale /= 2;
8206     EltBits *= 2;
8207     NumElements /= 2;
8208   } while (Scale > 1);
8209   return DAG.getBitcast(VT, InputV);
8210 }
8211
8212 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8213 ///
8214 /// This routine will try to do everything in its power to cleverly lower
8215 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8216 /// check for the profitability of this lowering,  it tries to aggressively
8217 /// match this pattern. It will use all of the micro-architectural details it
8218 /// can to emit an efficient lowering. It handles both blends with all-zero
8219 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8220 /// masking out later).
8221 ///
8222 /// The reason we have dedicated lowering for zext-style shuffles is that they
8223 /// are both incredibly common and often quite performance sensitive.
8224 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8225     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8226     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8227   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8228
8229   int Bits = VT.getSizeInBits();
8230   int NumLanes = Bits / 128;
8231   int NumElements = VT.getVectorNumElements();
8232   int NumEltsPerLane = NumElements / NumLanes;
8233   assert(VT.getScalarSizeInBits() <= 32 &&
8234          "Exceeds 32-bit integer zero extension limit");
8235   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8236
8237   // Define a helper function to check a particular ext-scale and lower to it if
8238   // valid.
8239   auto Lower = [&](int Scale) -> SDValue {
8240     SDValue InputV;
8241     bool AnyExt = true;
8242     int Offset = 0;
8243     int Matches = 0;
8244     for (int i = 0; i < NumElements; ++i) {
8245       int M = Mask[i];
8246       if (M < 0)
8247         continue; // Valid anywhere but doesn't tell us anything.
8248       if (i % Scale != 0) {
8249         // Each of the extended elements need to be zeroable.
8250         if (!Zeroable[i])
8251           return SDValue();
8252
8253         // We no longer are in the anyext case.
8254         AnyExt = false;
8255         continue;
8256       }
8257
8258       // Each of the base elements needs to be consecutive indices into the
8259       // same input vector.
8260       SDValue V = M < NumElements ? V1 : V2;
8261       M = M % NumElements;
8262       if (!InputV) {
8263         InputV = V;
8264         Offset = M - (i / Scale);
8265       } else if (InputV != V)
8266         return SDValue(); // Flip-flopping inputs.
8267
8268       // Offset must start in the lowest 128-bit lane or at the start of an
8269       // upper lane.
8270       // FIXME: Is it ever worth allowing a negative base offset?
8271       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
8272             (Offset % NumEltsPerLane) == 0))
8273         return SDValue();
8274
8275       // If we are offsetting, all referenced entries must come from the same
8276       // lane.
8277       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
8278         return SDValue();
8279
8280       if ((M % NumElements) != (Offset + (i / Scale)))
8281         return SDValue(); // Non-consecutive strided elements.
8282       Matches++;
8283     }
8284
8285     // If we fail to find an input, we have a zero-shuffle which should always
8286     // have already been handled.
8287     // FIXME: Maybe handle this here in case during blending we end up with one?
8288     if (!InputV)
8289       return SDValue();
8290
8291     // If we are offsetting, don't extend if we only match a single input, we
8292     // can always do better by using a basic PSHUF or PUNPCK.
8293     if (Offset != 0 && Matches < 2)
8294       return SDValue();
8295
8296     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8297         DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
8298   };
8299
8300   // The widest scale possible for extending is to a 64-bit integer.
8301   assert(Bits % 64 == 0 &&
8302          "The number of bits in a vector must be divisible by 64 on x86!");
8303   int NumExtElements = Bits / 64;
8304
8305   // Each iteration, try extending the elements half as much, but into twice as
8306   // many elements.
8307   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8308     assert(NumElements % NumExtElements == 0 &&
8309            "The input vector size must be divisible by the extended size.");
8310     if (SDValue V = Lower(NumElements / NumExtElements))
8311       return V;
8312   }
8313
8314   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8315   if (Bits != 128)
8316     return SDValue();
8317
8318   // Returns one of the source operands if the shuffle can be reduced to a
8319   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8320   auto CanZExtLowHalf = [&]() {
8321     for (int i = NumElements / 2; i != NumElements; ++i)
8322       if (!Zeroable[i])
8323         return SDValue();
8324     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8325       return V1;
8326     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8327       return V2;
8328     return SDValue();
8329   };
8330
8331   if (SDValue V = CanZExtLowHalf()) {
8332     V = DAG.getBitcast(MVT::v2i64, V);
8333     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8334     return DAG.getBitcast(VT, V);
8335   }
8336
8337   // No viable ext lowering found.
8338   return SDValue();
8339 }
8340
8341 /// \brief Try to get a scalar value for a specific element of a vector.
8342 ///
8343 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8344 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8345                                               SelectionDAG &DAG) {
8346   MVT VT = V.getSimpleValueType();
8347   MVT EltVT = VT.getVectorElementType();
8348   V = peekThroughBitcasts(V);
8349
8350   // If the bitcasts shift the element size, we can't extract an equivalent
8351   // element from it.
8352   MVT NewVT = V.getSimpleValueType();
8353   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8354     return SDValue();
8355
8356   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8357       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
8358     // Ensure the scalar operand is the same size as the destination.
8359     // FIXME: Add support for scalar truncation where possible.
8360     SDValue S = V.getOperand(Idx);
8361     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
8362       return DAG.getBitcast(EltVT, S);
8363   }
8364
8365   return SDValue();
8366 }
8367
8368 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8369 ///
8370 /// This is particularly important because the set of instructions varies
8371 /// significantly based on whether the operand is a load or not.
8372 static bool isShuffleFoldableLoad(SDValue V) {
8373   V = peekThroughBitcasts(V);
8374   return ISD::isNON_EXTLoad(V.getNode());
8375 }
8376
8377 /// \brief Try to lower insertion of a single element into a zero vector.
8378 ///
8379 /// This is a common pattern that we have especially efficient patterns to lower
8380 /// across all subtarget feature sets.
8381 static SDValue lowerVectorShuffleAsElementInsertion(
8382     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8383     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8384   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8385   MVT ExtVT = VT;
8386   MVT EltVT = VT.getVectorElementType();
8387
8388   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8389                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8390                 Mask.begin();
8391   bool IsV1Zeroable = true;
8392   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8393     if (i != V2Index && !Zeroable[i]) {
8394       IsV1Zeroable = false;
8395       break;
8396     }
8397
8398   // Check for a single input from a SCALAR_TO_VECTOR node.
8399   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8400   // all the smarts here sunk into that routine. However, the current
8401   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8402   // vector shuffle lowering is dead.
8403   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
8404                                                DAG);
8405   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
8406     // We need to zext the scalar if it is smaller than an i32.
8407     V2S = DAG.getBitcast(EltVT, V2S);
8408     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8409       // Using zext to expand a narrow element won't work for non-zero
8410       // insertions.
8411       if (!IsV1Zeroable)
8412         return SDValue();
8413
8414       // Zero-extend directly to i32.
8415       ExtVT = MVT::v4i32;
8416       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8417     }
8418     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8419   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8420              EltVT == MVT::i16) {
8421     // Either not inserting from the low element of the input or the input
8422     // element size is too small to use VZEXT_MOVL to clear the high bits.
8423     return SDValue();
8424   }
8425
8426   if (!IsV1Zeroable) {
8427     // If V1 can't be treated as a zero vector we have fewer options to lower
8428     // this. We can't support integer vectors or non-zero targets cheaply, and
8429     // the V1 elements can't be permuted in any way.
8430     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8431     if (!VT.isFloatingPoint() || V2Index != 0)
8432       return SDValue();
8433     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8434     V1Mask[V2Index] = -1;
8435     if (!isNoopShuffleMask(V1Mask))
8436       return SDValue();
8437     // This is essentially a special case blend operation, but if we have
8438     // general purpose blend operations, they are always faster. Bail and let
8439     // the rest of the lowering handle these as blends.
8440     if (Subtarget.hasSSE41())
8441       return SDValue();
8442
8443     // Otherwise, use MOVSD or MOVSS.
8444     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8445            "Only two types of floating point element types to handle!");
8446     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8447                        ExtVT, V1, V2);
8448   }
8449
8450   // This lowering only works for the low element with floating point vectors.
8451   if (VT.isFloatingPoint() && V2Index != 0)
8452     return SDValue();
8453
8454   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8455   if (ExtVT != VT)
8456     V2 = DAG.getBitcast(VT, V2);
8457
8458   if (V2Index != 0) {
8459     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8460     // the desired position. Otherwise it is more efficient to do a vector
8461     // shift left. We know that we can do a vector shift left because all
8462     // the inputs are zero.
8463     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8464       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8465       V2Shuffle[V2Index] = 0;
8466       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8467     } else {
8468       V2 = DAG.getBitcast(MVT::v16i8, V2);
8469       V2 = DAG.getNode(
8470           X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
8471           DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
8472                           DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
8473                               DAG.getDataLayout(), VT)));
8474       V2 = DAG.getBitcast(VT, V2);
8475     }
8476   }
8477   return V2;
8478 }
8479
8480 /// Try to lower broadcast of a single - truncated - integer element,
8481 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
8482 ///
8483 /// This assumes we have AVX2.
8484 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
8485                                                   SDValue V0, int BroadcastIdx,
8486                                                   const X86Subtarget &Subtarget,
8487                                                   SelectionDAG &DAG) {
8488   assert(Subtarget.hasAVX2() &&
8489          "We can only lower integer broadcasts with AVX2!");
8490
8491   EVT EltVT = VT.getVectorElementType();
8492   EVT V0VT = V0.getValueType();
8493
8494   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
8495   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
8496
8497   EVT V0EltVT = V0VT.getVectorElementType();
8498   if (!V0EltVT.isInteger())
8499     return SDValue();
8500
8501   const unsigned EltSize = EltVT.getSizeInBits();
8502   const unsigned V0EltSize = V0EltVT.getSizeInBits();
8503
8504   // This is only a truncation if the original element type is larger.
8505   if (V0EltSize <= EltSize)
8506     return SDValue();
8507
8508   assert(((V0EltSize % EltSize) == 0) &&
8509          "Scalar type sizes must all be powers of 2 on x86!");
8510
8511   const unsigned V0Opc = V0.getOpcode();
8512   const unsigned Scale = V0EltSize / EltSize;
8513   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
8514
8515   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
8516       V0Opc != ISD::BUILD_VECTOR)
8517     return SDValue();
8518
8519   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
8520
8521   // If we're extracting non-least-significant bits, shift so we can truncate.
8522   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
8523   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
8524   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
8525   if (const int OffsetIdx = BroadcastIdx % Scale)
8526     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
8527             DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
8528
8529   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
8530                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
8531 }
8532
8533 /// \brief Try to lower broadcast of a single element.
8534 ///
8535 /// For convenience, this code also bundles all of the subtarget feature set
8536 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8537 /// a convenient way to factor it out.
8538 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
8539 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
8540                                              SDValue V1, SDValue V2,
8541                                              ArrayRef<int> Mask,
8542                                              const X86Subtarget &Subtarget,
8543                                              SelectionDAG &DAG) {
8544   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
8545         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
8546         (Subtarget.hasAVX2() && VT.isInteger())))
8547     return SDValue();
8548
8549   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
8550   // we can only broadcast from a register with AVX2.
8551   unsigned NumElts = Mask.size();
8552   unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
8553   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
8554
8555   // Check that the mask is a broadcast.
8556   int BroadcastIdx = -1;
8557   for (int i = 0; i != (int)NumElts; ++i) {
8558     SmallVector<int, 8> BroadcastMask(NumElts, i);
8559     if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
8560       BroadcastIdx = i;
8561       break;
8562     }
8563   }
8564
8565   if (BroadcastIdx < 0)
8566     return SDValue();
8567   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8568                                             "a sorted mask where the broadcast "
8569                                             "comes from V1.");
8570
8571   // Go up the chain of (vector) values to find a scalar load that we can
8572   // combine with the broadcast.
8573   SDValue V = V1;
8574   for (;;) {
8575     switch (V.getOpcode()) {
8576     case ISD::BITCAST: {
8577       SDValue VSrc = V.getOperand(0);
8578       MVT SrcVT = VSrc.getSimpleValueType();
8579       if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
8580         break;
8581       V = VSrc;
8582       continue;
8583     }
8584     case ISD::CONCAT_VECTORS: {
8585       int OperandSize = Mask.size() / V.getNumOperands();
8586       V = V.getOperand(BroadcastIdx / OperandSize);
8587       BroadcastIdx %= OperandSize;
8588       continue;
8589     }
8590     case ISD::INSERT_SUBVECTOR: {
8591       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8592       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8593       if (!ConstantIdx)
8594         break;
8595
8596       int BeginIdx = (int)ConstantIdx->getZExtValue();
8597       int EndIdx =
8598           BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
8599       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8600         BroadcastIdx -= BeginIdx;
8601         V = VInner;
8602       } else {
8603         V = VOuter;
8604       }
8605       continue;
8606     }
8607     }
8608     break;
8609   }
8610
8611   // Check if this is a broadcast of a scalar. We special case lowering
8612   // for scalars so that we can more effectively fold with loads.
8613   // First, look through bitcast: if the original value has a larger element
8614   // type than the shuffle, the broadcast element is in essence truncated.
8615   // Make that explicit to ease folding.
8616   if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
8617     if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
8618             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
8619       return TruncBroadcast;
8620
8621   MVT BroadcastVT = VT;
8622
8623   // Peek through any bitcast (only useful for loads).
8624   SDValue BC = peekThroughBitcasts(V);
8625
8626   // Also check the simpler case, where we can directly reuse the scalar.
8627   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8628       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8629     V = V.getOperand(BroadcastIdx);
8630
8631     // If we can't broadcast from a register, check that the input is a load.
8632     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
8633       return SDValue();
8634   } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
8635     // 32-bit targets need to load i64 as a f64 and then bitcast the result.
8636     if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
8637       BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
8638       Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
8639     }
8640
8641     // If we are broadcasting a load that is only used by the shuffle
8642     // then we can reduce the vector load to the broadcasted scalar load.
8643     LoadSDNode *Ld = cast<LoadSDNode>(BC);
8644     SDValue BaseAddr = Ld->getOperand(1);
8645     EVT SVT = BroadcastVT.getScalarType();
8646     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
8647     SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
8648     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
8649                     DAG.getMachineFunction().getMachineMemOperand(
8650                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
8651   } else if (!BroadcastFromReg) {
8652     // We can't broadcast from a vector register.
8653     return SDValue();
8654   } else if (BroadcastIdx != 0) {
8655     // We can only broadcast from the zero-element of a vector register,
8656     // but it can be advantageous to broadcast from the zero-element of a
8657     // subvector.
8658     if (!VT.is256BitVector() && !VT.is512BitVector())
8659       return SDValue();
8660
8661     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
8662     if (VT == MVT::v4f64 || VT == MVT::v4i64)
8663       return SDValue();
8664
8665     // Only broadcast the zero-element of a 128-bit subvector.
8666     unsigned EltSize = VT.getScalarSizeInBits();
8667     if (((BroadcastIdx * EltSize) % 128) != 0)
8668       return SDValue();
8669
8670     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
8671     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
8672                     DAG.getIntPtrConstant(BroadcastIdx, DL));
8673   }
8674
8675   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
8676     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
8677                     DAG.getBitcast(MVT::f64, V));
8678
8679   // Bitcast back to the same scalar type as BroadcastVT.
8680   MVT SrcVT = V.getSimpleValueType();
8681   if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
8682     assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
8683            "Unexpected vector element size");
8684     if (SrcVT.isVector()) {
8685       unsigned NumSrcElts = SrcVT.getVectorNumElements();
8686       SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
8687     } else {
8688       SrcVT = BroadcastVT.getScalarType();
8689     }
8690     V = DAG.getBitcast(SrcVT, V);
8691   }
8692
8693   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
8694 }
8695
8696 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8697 // INSERTPS when the V1 elements are already in the correct locations
8698 // because otherwise we can just always use two SHUFPS instructions which
8699 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8700 // perform INSERTPS if a single V1 element is out of place and all V2
8701 // elements are zeroable.
8702 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
8703                                          unsigned &InsertPSMask,
8704                                          const SmallBitVector &Zeroable,
8705                                          ArrayRef<int> Mask,
8706                                          SelectionDAG &DAG) {
8707   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
8708   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
8709   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8710   unsigned ZMask = 0;
8711   int V1DstIndex = -1;
8712   int V2DstIndex = -1;
8713   bool V1UsedInPlace = false;
8714
8715   for (int i = 0; i < 4; ++i) {
8716     // Synthesize a zero mask from the zeroable elements (includes undefs).
8717     if (Zeroable[i]) {
8718       ZMask |= 1 << i;
8719       continue;
8720     }
8721
8722     // Flag if we use any V1 inputs in place.
8723     if (i == Mask[i]) {
8724       V1UsedInPlace = true;
8725       continue;
8726     }
8727
8728     // We can only insert a single non-zeroable element.
8729     if (V1DstIndex >= 0 || V2DstIndex >= 0)
8730       return false;
8731
8732     if (Mask[i] < 4) {
8733       // V1 input out of place for insertion.
8734       V1DstIndex = i;
8735     } else {
8736       // V2 input for insertion.
8737       V2DstIndex = i;
8738     }
8739   }
8740
8741   // Don't bother if we have no (non-zeroable) element for insertion.
8742   if (V1DstIndex < 0 && V2DstIndex < 0)
8743     return false;
8744
8745   // Determine element insertion src/dst indices. The src index is from the
8746   // start of the inserted vector, not the start of the concatenated vector.
8747   unsigned V2SrcIndex = 0;
8748   if (V1DstIndex >= 0) {
8749     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8750     // and don't use the original V2 at all.
8751     V2SrcIndex = Mask[V1DstIndex];
8752     V2DstIndex = V1DstIndex;
8753     V2 = V1;
8754   } else {
8755     V2SrcIndex = Mask[V2DstIndex] - 4;
8756   }
8757
8758   // If no V1 inputs are used in place, then the result is created only from
8759   // the zero mask and the V2 insertion - so remove V1 dependency.
8760   if (!V1UsedInPlace)
8761     V1 = DAG.getUNDEF(MVT::v4f32);
8762
8763   // Insert the V2 element into the desired position.
8764   InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8765   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8766   return true;
8767 }
8768
8769 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
8770                                             SDValue V2, ArrayRef<int> Mask,
8771                                             SelectionDAG &DAG) {
8772   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8773   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8774   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8775
8776   // Attempt to match the insertps pattern.
8777   unsigned InsertPSMask;
8778   if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
8779     return SDValue();
8780
8781   // Insert the V2 element into the desired position.
8782   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8783                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
8784 }
8785
8786 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
8787 /// UNPCK instruction.
8788 ///
8789 /// This specifically targets cases where we end up with alternating between
8790 /// the two inputs, and so can permute them into something that feeds a single
8791 /// UNPCK instruction. Note that this routine only targets integer vectors
8792 /// because for floating point vectors we have a generalized SHUFPS lowering
8793 /// strategy that handles everything that doesn't *exactly* match an unpack,
8794 /// making this clever lowering unnecessary.
8795 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
8796                                                     SDValue V1, SDValue V2,
8797                                                     ArrayRef<int> Mask,
8798                                                     SelectionDAG &DAG) {
8799   assert(!VT.isFloatingPoint() &&
8800          "This routine only supports integer vectors.");
8801   assert(VT.is128BitVector() &&
8802          "This routine only works on 128-bit vectors.");
8803   assert(!V2.isUndef() &&
8804          "This routine should only be used when blending two inputs.");
8805   assert(Mask.size() >= 2 && "Single element masks are invalid.");
8806
8807   int Size = Mask.size();
8808
8809   int NumLoInputs =
8810       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
8811   int NumHiInputs =
8812       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
8813
8814   bool UnpackLo = NumLoInputs >= NumHiInputs;
8815
8816   auto TryUnpack = [&](int ScalarSize, int Scale) {
8817     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
8818     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
8819
8820     for (int i = 0; i < Size; ++i) {
8821       if (Mask[i] < 0)
8822         continue;
8823
8824       // Each element of the unpack contains Scale elements from this mask.
8825       int UnpackIdx = i / Scale;
8826
8827       // We only handle the case where V1 feeds the first slots of the unpack.
8828       // We rely on canonicalization to ensure this is the case.
8829       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
8830         return SDValue();
8831
8832       // Setup the mask for this input. The indexing is tricky as we have to
8833       // handle the unpack stride.
8834       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
8835       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
8836           Mask[i] % Size;
8837     }
8838
8839     // If we will have to shuffle both inputs to use the unpack, check whether
8840     // we can just unpack first and shuffle the result. If so, skip this unpack.
8841     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
8842         !isNoopShuffleMask(V2Mask))
8843       return SDValue();
8844
8845     // Shuffle the inputs into place.
8846     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8847     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8848
8849     // Cast the inputs to the type we will use to unpack them.
8850     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
8851     V1 = DAG.getBitcast(UnpackVT, V1);
8852     V2 = DAG.getBitcast(UnpackVT, V2);
8853
8854     // Unpack the inputs and cast the result back to the desired type.
8855     return DAG.getBitcast(
8856         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
8857                         UnpackVT, V1, V2));
8858   };
8859
8860   // We try each unpack from the largest to the smallest to try and find one
8861   // that fits this mask.
8862   int OrigScalarSize = VT.getScalarSizeInBits();
8863   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
8864     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
8865       return Unpack;
8866
8867   // If none of the unpack-rooted lowerings worked (or were profitable) try an
8868   // initial unpack.
8869   if (NumLoInputs == 0 || NumHiInputs == 0) {
8870     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
8871            "We have to have *some* inputs!");
8872     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
8873
8874     // FIXME: We could consider the total complexity of the permute of each
8875     // possible unpacking. Or at the least we should consider how many
8876     // half-crossings are created.
8877     // FIXME: We could consider commuting the unpacks.
8878
8879     SmallVector<int, 32> PermMask((unsigned)Size, -1);
8880     for (int i = 0; i < Size; ++i) {
8881       if (Mask[i] < 0)
8882         continue;
8883
8884       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
8885
8886       PermMask[i] =
8887           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
8888     }
8889     return DAG.getVectorShuffle(
8890         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
8891                             DL, VT, V1, V2),
8892         DAG.getUNDEF(VT), PermMask);
8893   }
8894
8895   return SDValue();
8896 }
8897
8898 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8899 ///
8900 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8901 /// support for floating point shuffles but not integer shuffles. These
8902 /// instructions will incur a domain crossing penalty on some chips though so
8903 /// it is better to avoid lowering through this for integer vectors where
8904 /// possible.
8905 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8906                                        SDValue V1, SDValue V2,
8907                                        const X86Subtarget &Subtarget,
8908                                        SelectionDAG &DAG) {
8909   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8910   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8911   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8912
8913   if (V2.isUndef()) {
8914     // Check for being able to broadcast a single element.
8915     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8916             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8917       return Broadcast;
8918
8919     // Straight shuffle of a single input vector. Simulate this by using the
8920     // single input as both of the "inputs" to this instruction..
8921     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8922
8923     if (Subtarget.hasAVX()) {
8924       // If we have AVX, we can use VPERMILPS which will allow folding a load
8925       // into the shuffle.
8926       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8927                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8928     }
8929
8930     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
8931                        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8932   }
8933   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8934   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8935
8936   // If we have a single input, insert that into V1 if we can do so cheaply.
8937   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8938     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8939             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8940       return Insertion;
8941     // Try inverting the insertion since for v2 masks it is easy to do and we
8942     // can't reliably sort the mask one way or the other.
8943     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8944                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8945     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8946             DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
8947       return Insertion;
8948   }
8949
8950   // Try to use one of the special instruction patterns to handle two common
8951   // blend patterns if a zero-blend above didn't work.
8952   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
8953       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
8954     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8955       // We can either use a special instruction to load over the low double or
8956       // to move just the low double.
8957       return DAG.getNode(
8958           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8959           DL, MVT::v2f64, V2,
8960           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8961
8962   if (Subtarget.hasSSE41())
8963     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8964                                                   Subtarget, DAG))
8965       return Blend;
8966
8967   // Use dedicated unpack instructions for masks that match their pattern.
8968   if (SDValue V =
8969           lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
8970     return V;
8971
8972   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8973   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
8974                      DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8975 }
8976
8977 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8978 ///
8979 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8980 /// the integer unit to minimize domain crossing penalties. However, for blends
8981 /// it falls back to the floating point shuffle operation with appropriate bit
8982 /// casting.
8983 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8984                                        SDValue V1, SDValue V2,
8985                                        const X86Subtarget &Subtarget,
8986                                        SelectionDAG &DAG) {
8987   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8988   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8989   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8990
8991   if (V2.isUndef()) {
8992     // Check for being able to broadcast a single element.
8993     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8994             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8995       return Broadcast;
8996
8997     // Straight shuffle of a single input vector. For everything from SSE2
8998     // onward this has a single fast instruction with no scary immediates.
8999     // We have to map the mask as it is actually a v4i32 shuffle instruction.
9000     V1 = DAG.getBitcast(MVT::v4i32, V1);
9001     int WidenedMask[4] = {
9002         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9003         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
9004     return DAG.getBitcast(
9005         MVT::v2i64,
9006         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9007                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
9008   }
9009   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9010   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9011   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9012   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9013
9014   // If we have a blend of two same-type PACKUS operations and the blend aligns
9015   // with the low and high halves, we can just merge the PACKUS operations.
9016   // This is particularly important as it lets us merge shuffles that this
9017   // routine itself creates.
9018   auto GetPackNode = [](SDValue V) {
9019     V = peekThroughBitcasts(V);
9020     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
9021   };
9022   if (SDValue V1Pack = GetPackNode(V1))
9023     if (SDValue V2Pack = GetPackNode(V2)) {
9024       EVT PackVT = V1Pack.getValueType();
9025       if (PackVT == V2Pack.getValueType())
9026         return DAG.getBitcast(MVT::v2i64,
9027                               DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9028                                           Mask[0] == 0 ? V1Pack.getOperand(0)
9029                                                        : V1Pack.getOperand(1),
9030                                           Mask[1] == 2 ? V2Pack.getOperand(0)
9031                                                        : V2Pack.getOperand(1)));
9032     }
9033
9034   // Try to use shift instructions.
9035   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
9036                                                 Subtarget, DAG))
9037     return Shift;
9038
9039   // When loading a scalar and then shuffling it into a vector we can often do
9040   // the insertion cheaply.
9041   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9042           DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9043     return Insertion;
9044   // Try inverting the insertion since for v2 masks it is easy to do and we
9045   // can't reliably sort the mask one way or the other.
9046   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
9047   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9048           DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
9049     return Insertion;
9050
9051   // We have different paths for blend lowering, but they all must use the
9052   // *exact* same predicate.
9053   bool IsBlendSupported = Subtarget.hasSSE41();
9054   if (IsBlendSupported)
9055     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
9056                                                   Subtarget, DAG))
9057       return Blend;
9058
9059   // Use dedicated unpack instructions for masks that match their pattern.
9060   if (SDValue V =
9061           lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
9062     return V;
9063
9064   // Try to use byte rotation instructions.
9065   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
9066   if (Subtarget.hasSSSE3())
9067     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9068             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9069       return Rotate;
9070
9071   // If we have direct support for blends, we should lower by decomposing into
9072   // a permute. That will be faster than the domain cross.
9073   if (IsBlendSupported)
9074     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
9075                                                       Mask, DAG);
9076
9077   // We implement this with SHUFPD which is pretty lame because it will likely
9078   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9079   // However, all the alternatives are still more cycles and newer chips don't
9080   // have this problem. It would be really nice if x86 had better shuffles here.
9081   V1 = DAG.getBitcast(MVT::v2f64, V1);
9082   V2 = DAG.getBitcast(MVT::v2f64, V2);
9083   return DAG.getBitcast(MVT::v2i64,
9084                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
9085 }
9086
9087 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
9088 ///
9089 /// This is used to disable more specialized lowerings when the shufps lowering
9090 /// will happen to be efficient.
9091 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
9092   // This routine only handles 128-bit shufps.
9093   assert(Mask.size() == 4 && "Unsupported mask size!");
9094   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
9095   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
9096   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
9097   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
9098
9099   // To lower with a single SHUFPS we need to have the low half and high half
9100   // each requiring a single input.
9101   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
9102     return false;
9103   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
9104     return false;
9105
9106   return true;
9107 }
9108
9109 /// \brief Lower a vector shuffle using the SHUFPS instruction.
9110 ///
9111 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
9112 /// It makes no assumptions about whether this is the *best* lowering, it simply
9113 /// uses it.
9114 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
9115                                             ArrayRef<int> Mask, SDValue V1,
9116                                             SDValue V2, SelectionDAG &DAG) {
9117   SDValue LowV = V1, HighV = V2;
9118   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
9119
9120   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9121
9122   if (NumV2Elements == 1) {
9123     int V2Index =
9124         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
9125         Mask.begin();
9126
9127     // Compute the index adjacent to V2Index and in the same half by toggling
9128     // the low bit.
9129     int V2AdjIndex = V2Index ^ 1;
9130
9131     if (Mask[V2AdjIndex] < 0) {
9132       // Handles all the cases where we have a single V2 element and an undef.
9133       // This will only ever happen in the high lanes because we commute the
9134       // vector otherwise.
9135       if (V2Index < 2)
9136         std::swap(LowV, HighV);
9137       NewMask[V2Index] -= 4;
9138     } else {
9139       // Handle the case where the V2 element ends up adjacent to a V1 element.
9140       // To make this work, blend them together as the first step.
9141       int V1Index = V2AdjIndex;
9142       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
9143       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9144                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9145
9146       // Now proceed to reconstruct the final blend as we have the necessary
9147       // high or low half formed.
9148       if (V2Index < 2) {
9149         LowV = V2;
9150         HighV = V1;
9151       } else {
9152         HighV = V2;
9153       }
9154       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
9155       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
9156     }
9157   } else if (NumV2Elements == 2) {
9158     if (Mask[0] < 4 && Mask[1] < 4) {
9159       // Handle the easy case where we have V1 in the low lanes and V2 in the
9160       // high lanes.
9161       NewMask[2] -= 4;
9162       NewMask[3] -= 4;
9163     } else if (Mask[2] < 4 && Mask[3] < 4) {
9164       // We also handle the reversed case because this utility may get called
9165       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
9166       // arrange things in the right direction.
9167       NewMask[0] -= 4;
9168       NewMask[1] -= 4;
9169       HighV = V1;
9170       LowV = V2;
9171     } else {
9172       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
9173       // trying to place elements directly, just blend them and set up the final
9174       // shuffle to place them.
9175
9176       // The first two blend mask elements are for V1, the second two are for
9177       // V2.
9178       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
9179                           Mask[2] < 4 ? Mask[2] : Mask[3],
9180                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
9181                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
9182       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
9183                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9184
9185       // Now we do a normal shuffle of V1 by giving V1 as both operands to
9186       // a blend.
9187       LowV = HighV = V1;
9188       NewMask[0] = Mask[0] < 4 ? 0 : 2;
9189       NewMask[1] = Mask[0] < 4 ? 2 : 0;
9190       NewMask[2] = Mask[2] < 4 ? 1 : 3;
9191       NewMask[3] = Mask[2] < 4 ? 3 : 1;
9192     }
9193   }
9194   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
9195                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
9196 }
9197
9198 /// \brief Lower 4-lane 32-bit floating point shuffles.
9199 ///
9200 /// Uses instructions exclusively from the floating point unit to minimize
9201 /// domain crossing penalties, as these are sufficient to implement all v4f32
9202 /// shuffles.
9203 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9204                                        SDValue V1, SDValue V2,
9205                                        const X86Subtarget &Subtarget,
9206                                        SelectionDAG &DAG) {
9207   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9208   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9209   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9210
9211   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9212
9213   if (NumV2Elements == 0) {
9214     // Check for being able to broadcast a single element.
9215     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9216             DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
9217       return Broadcast;
9218
9219     // Use even/odd duplicate instructions for masks that match their pattern.
9220     if (Subtarget.hasSSE3()) {
9221       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
9222         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
9223       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
9224         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
9225     }
9226
9227     if (Subtarget.hasAVX()) {
9228       // If we have AVX, we can use VPERMILPS which will allow folding a load
9229       // into the shuffle.
9230       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
9231                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9232     }
9233
9234     // Otherwise, use a straight shuffle of a single input vector. We pass the
9235     // input vector to both operands to simulate this with a SHUFPS.
9236     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
9237                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9238   }
9239
9240   // There are special ways we can lower some single-element blends. However, we
9241   // have custom ways we can lower more complex single-element blends below that
9242   // we defer to if both this and BLENDPS fail to match, so restrict this to
9243   // when the V2 input is targeting element 0 of the mask -- that is the fast
9244   // case here.
9245   if (NumV2Elements == 1 && Mask[0] >= 4)
9246     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
9247                                                          Mask, Subtarget, DAG))
9248       return V;
9249
9250   if (Subtarget.hasSSE41()) {
9251     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
9252                                                   Subtarget, DAG))
9253       return Blend;
9254
9255     // Use INSERTPS if we can complete the shuffle efficiently.
9256     if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
9257       return V;
9258
9259     if (!isSingleSHUFPSMask(Mask))
9260       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
9261               DL, MVT::v4f32, V1, V2, Mask, DAG))
9262         return BlendPerm;
9263   }
9264
9265   // Use low/high mov instructions.
9266   if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
9267     return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
9268   if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
9269     return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
9270
9271   // Use dedicated unpack instructions for masks that match their pattern.
9272   if (SDValue V =
9273           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
9274     return V;
9275
9276   // Otherwise fall back to a SHUFPS lowering strategy.
9277   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
9278 }
9279
9280 /// \brief Lower 4-lane i32 vector shuffles.
9281 ///
9282 /// We try to handle these with integer-domain shuffles where we can, but for
9283 /// blends we use the floating point domain blend instructions.
9284 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9285                                        SDValue V1, SDValue V2,
9286                                        const X86Subtarget &Subtarget,
9287                                        SelectionDAG &DAG) {
9288   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9289   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9290   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9291
9292   // Whenever we can lower this as a zext, that instruction is strictly faster
9293   // than any alternative. It also allows us to fold memory operands into the
9294   // shuffle in many cases.
9295   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
9296                                                          Mask, Subtarget, DAG))
9297     return ZExt;
9298
9299   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9300
9301   if (NumV2Elements == 0) {
9302     // Check for being able to broadcast a single element.
9303     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9304             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9305       return Broadcast;
9306
9307     // Straight shuffle of a single input vector. For everything from SSE2
9308     // onward this has a single fast instruction with no scary immediates.
9309     // We coerce the shuffle pattern to be compatible with UNPCK instructions
9310     // but we aren't actually going to use the UNPCK instruction because doing
9311     // so prevents folding a load into this instruction or making a copy.
9312     const int UnpackLoMask[] = {0, 0, 1, 1};
9313     const int UnpackHiMask[] = {2, 2, 3, 3};
9314     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
9315       Mask = UnpackLoMask;
9316     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
9317       Mask = UnpackHiMask;
9318
9319     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9320                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9321   }
9322
9323   // Try to use shift instructions.
9324   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
9325                                                 Subtarget, DAG))
9326     return Shift;
9327
9328   // There are special ways we can lower some single-element blends.
9329   if (NumV2Elements == 1)
9330     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
9331                                                          Mask, Subtarget, DAG))
9332       return V;
9333
9334   // We have different paths for blend lowering, but they all must use the
9335   // *exact* same predicate.
9336   bool IsBlendSupported = Subtarget.hasSSE41();
9337   if (IsBlendSupported)
9338     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
9339                                                   Subtarget, DAG))
9340       return Blend;
9341
9342   if (SDValue Masked =
9343           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
9344     return Masked;
9345
9346   // Use dedicated unpack instructions for masks that match their pattern.
9347   if (SDValue V =
9348           lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
9349     return V;
9350
9351   // Try to use byte rotation instructions.
9352   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
9353   if (Subtarget.hasSSSE3())
9354     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9355             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9356       return Rotate;
9357
9358   // If we have direct support for blends, we should lower by decomposing into
9359   // a permute. That will be faster than the domain cross.
9360   if (IsBlendSupported)
9361     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
9362                                                       Mask, DAG);
9363
9364   // Try to lower by permuting the inputs into an unpack instruction.
9365   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
9366                                                             V2, Mask, DAG))
9367     return Unpack;
9368
9369   // We implement this with SHUFPS because it can blend from two vectors.
9370   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
9371   // up the inputs, bypassing domain shift penalties that we would encur if we
9372   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
9373   // relevant.
9374   return DAG.getBitcast(
9375       MVT::v4i32,
9376       DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
9377                            DAG.getBitcast(MVT::v4f32, V2), Mask));
9378 }
9379
9380 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
9381 /// shuffle lowering, and the most complex part.
9382 ///
9383 /// The lowering strategy is to try to form pairs of input lanes which are
9384 /// targeted at the same half of the final vector, and then use a dword shuffle
9385 /// to place them onto the right half, and finally unpack the paired lanes into
9386 /// their final position.
9387 ///
9388 /// The exact breakdown of how to form these dword pairs and align them on the
9389 /// correct sides is really tricky. See the comments within the function for
9390 /// more of the details.
9391 ///
9392 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
9393 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
9394 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
9395 /// vector, form the analogous 128-bit 8-element Mask.
9396 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
9397     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
9398     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9399   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
9400   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
9401
9402   assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
9403   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
9404   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
9405
9406   SmallVector<int, 4> LoInputs;
9407   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
9408                [](int M) { return M >= 0; });
9409   std::sort(LoInputs.begin(), LoInputs.end());
9410   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
9411   SmallVector<int, 4> HiInputs;
9412   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
9413                [](int M) { return M >= 0; });
9414   std::sort(HiInputs.begin(), HiInputs.end());
9415   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
9416   int NumLToL =
9417       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
9418   int NumHToL = LoInputs.size() - NumLToL;
9419   int NumLToH =
9420       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
9421   int NumHToH = HiInputs.size() - NumLToH;
9422   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
9423   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
9424   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
9425   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
9426
9427   // If we are splatting two values from one half - one to each half, then
9428   // we can shuffle that half so each is splatted to a dword, then splat those
9429   // to their respective halves.
9430   auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
9431                         int DOffset) {
9432     int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
9433     int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
9434     V = DAG.getNode(ShufWOp, DL, VT, V,
9435                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9436     V = DAG.getBitcast(PSHUFDVT, V);
9437     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
9438                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9439     return DAG.getBitcast(VT, V);
9440   };
9441
9442   if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
9443     return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
9444   if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
9445     return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
9446
9447   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
9448   // such inputs we can swap two of the dwords across the half mark and end up
9449   // with <=2 inputs to each half in each half. Once there, we can fall through
9450   // to the generic code below. For example:
9451   //
9452   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9453   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
9454   //
9455   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
9456   // and an existing 2-into-2 on the other half. In this case we may have to
9457   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
9458   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
9459   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
9460   // because any other situation (including a 3-into-1 or 1-into-3 in the other
9461   // half than the one we target for fixing) will be fixed when we re-enter this
9462   // path. We will also combine away any sequence of PSHUFD instructions that
9463   // result into a single instruction. Here is an example of the tricky case:
9464   //
9465   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9466   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
9467   //
9468   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
9469   //
9470   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
9471   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
9472   //
9473   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
9474   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
9475   //
9476   // The result is fine to be handled by the generic logic.
9477   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
9478                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
9479                           int AOffset, int BOffset) {
9480     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
9481            "Must call this with A having 3 or 1 inputs from the A half.");
9482     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
9483            "Must call this with B having 1 or 3 inputs from the B half.");
9484     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
9485            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
9486
9487     bool ThreeAInputs = AToAInputs.size() == 3;
9488
9489     // Compute the index of dword with only one word among the three inputs in
9490     // a half by taking the sum of the half with three inputs and subtracting
9491     // the sum of the actual three inputs. The difference is the remaining
9492     // slot.
9493     int ADWord, BDWord;
9494     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
9495     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
9496     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
9497     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
9498     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
9499     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
9500     int TripleNonInputIdx =
9501         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
9502     TripleDWord = TripleNonInputIdx / 2;
9503
9504     // We use xor with one to compute the adjacent DWord to whichever one the
9505     // OneInput is in.
9506     OneInputDWord = (OneInput / 2) ^ 1;
9507
9508     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
9509     // and BToA inputs. If there is also such a problem with the BToB and AToB
9510     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
9511     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
9512     // is essential that we don't *create* a 3<-1 as then we might oscillate.
9513     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
9514       // Compute how many inputs will be flipped by swapping these DWords. We
9515       // need
9516       // to balance this to ensure we don't form a 3-1 shuffle in the other
9517       // half.
9518       int NumFlippedAToBInputs =
9519           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
9520           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
9521       int NumFlippedBToBInputs =
9522           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
9523           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
9524       if ((NumFlippedAToBInputs == 1 &&
9525            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
9526           (NumFlippedBToBInputs == 1 &&
9527            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
9528         // We choose whether to fix the A half or B half based on whether that
9529         // half has zero flipped inputs. At zero, we may not be able to fix it
9530         // with that half. We also bias towards fixing the B half because that
9531         // will more commonly be the high half, and we have to bias one way.
9532         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
9533                                                        ArrayRef<int> Inputs) {
9534           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
9535           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
9536                                          PinnedIdx ^ 1) != Inputs.end();
9537           // Determine whether the free index is in the flipped dword or the
9538           // unflipped dword based on where the pinned index is. We use this bit
9539           // in an xor to conditionally select the adjacent dword.
9540           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
9541           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9542                                              FixFreeIdx) != Inputs.end();
9543           if (IsFixIdxInput == IsFixFreeIdxInput)
9544             FixFreeIdx += 1;
9545           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9546                                         FixFreeIdx) != Inputs.end();
9547           assert(IsFixIdxInput != IsFixFreeIdxInput &&
9548                  "We need to be changing the number of flipped inputs!");
9549           int PSHUFHalfMask[] = {0, 1, 2, 3};
9550           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
9551           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
9552                           MVT::v8i16, V,
9553                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9554
9555           for (int &M : Mask)
9556             if (M >= 0 && M == FixIdx)
9557               M = FixFreeIdx;
9558             else if (M >= 0 && M == FixFreeIdx)
9559               M = FixIdx;
9560         };
9561         if (NumFlippedBToBInputs != 0) {
9562           int BPinnedIdx =
9563               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9564           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9565         } else {
9566           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9567           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
9568           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9569         }
9570       }
9571     }
9572
9573     int PSHUFDMask[] = {0, 1, 2, 3};
9574     PSHUFDMask[ADWord] = BDWord;
9575     PSHUFDMask[BDWord] = ADWord;
9576     V = DAG.getBitcast(
9577         VT,
9578         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9579                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9580
9581     // Adjust the mask to match the new locations of A and B.
9582     for (int &M : Mask)
9583       if (M >= 0 && M/2 == ADWord)
9584         M = 2 * BDWord + M % 2;
9585       else if (M >= 0 && M/2 == BDWord)
9586         M = 2 * ADWord + M % 2;
9587
9588     // Recurse back into this routine to re-compute state now that this isn't
9589     // a 3 and 1 problem.
9590     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
9591                                                      DAG);
9592   };
9593   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9594     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9595   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9596     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9597
9598   // At this point there are at most two inputs to the low and high halves from
9599   // each half. That means the inputs can always be grouped into dwords and
9600   // those dwords can then be moved to the correct half with a dword shuffle.
9601   // We use at most one low and one high word shuffle to collect these paired
9602   // inputs into dwords, and finally a dword shuffle to place them.
9603   int PSHUFLMask[4] = {-1, -1, -1, -1};
9604   int PSHUFHMask[4] = {-1, -1, -1, -1};
9605   int PSHUFDMask[4] = {-1, -1, -1, -1};
9606
9607   // First fix the masks for all the inputs that are staying in their
9608   // original halves. This will then dictate the targets of the cross-half
9609   // shuffles.
9610   auto fixInPlaceInputs =
9611       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9612                     MutableArrayRef<int> SourceHalfMask,
9613                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9614     if (InPlaceInputs.empty())
9615       return;
9616     if (InPlaceInputs.size() == 1) {
9617       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9618           InPlaceInputs[0] - HalfOffset;
9619       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9620       return;
9621     }
9622     if (IncomingInputs.empty()) {
9623       // Just fix all of the in place inputs.
9624       for (int Input : InPlaceInputs) {
9625         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9626         PSHUFDMask[Input / 2] = Input / 2;
9627       }
9628       return;
9629     }
9630
9631     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9632     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9633         InPlaceInputs[0] - HalfOffset;
9634     // Put the second input next to the first so that they are packed into
9635     // a dword. We find the adjacent index by toggling the low bit.
9636     int AdjIndex = InPlaceInputs[0] ^ 1;
9637     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9638     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9639     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9640   };
9641   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9642   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9643
9644   // Now gather the cross-half inputs and place them into a free dword of
9645   // their target half.
9646   // FIXME: This operation could almost certainly be simplified dramatically to
9647   // look more like the 3-1 fixing operation.
9648   auto moveInputsToRightHalf = [&PSHUFDMask](
9649       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9650       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9651       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9652       int DestOffset) {
9653     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9654       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
9655     };
9656     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9657                                                int Word) {
9658       int LowWord = Word & ~1;
9659       int HighWord = Word | 1;
9660       return isWordClobbered(SourceHalfMask, LowWord) ||
9661              isWordClobbered(SourceHalfMask, HighWord);
9662     };
9663
9664     if (IncomingInputs.empty())
9665       return;
9666
9667     if (ExistingInputs.empty()) {
9668       // Map any dwords with inputs from them into the right half.
9669       for (int Input : IncomingInputs) {
9670         // If the source half mask maps over the inputs, turn those into
9671         // swaps and use the swapped lane.
9672         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9673           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
9674             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9675                 Input - SourceOffset;
9676             // We have to swap the uses in our half mask in one sweep.
9677             for (int &M : HalfMask)
9678               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9679                 M = Input;
9680               else if (M == Input)
9681                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9682           } else {
9683             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9684                        Input - SourceOffset &&
9685                    "Previous placement doesn't match!");
9686           }
9687           // Note that this correctly re-maps both when we do a swap and when
9688           // we observe the other side of the swap above. We rely on that to
9689           // avoid swapping the members of the input list directly.
9690           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9691         }
9692
9693         // Map the input's dword into the correct half.
9694         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
9695           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9696         else
9697           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9698                      Input / 2 &&
9699                  "Previous placement doesn't match!");
9700       }
9701
9702       // And just directly shift any other-half mask elements to be same-half
9703       // as we will have mirrored the dword containing the element into the
9704       // same position within that half.
9705       for (int &M : HalfMask)
9706         if (M >= SourceOffset && M < SourceOffset + 4) {
9707           M = M - SourceOffset + DestOffset;
9708           assert(M >= 0 && "This should never wrap below zero!");
9709         }
9710       return;
9711     }
9712
9713     // Ensure we have the input in a viable dword of its current half. This
9714     // is particularly tricky because the original position may be clobbered
9715     // by inputs being moved and *staying* in that half.
9716     if (IncomingInputs.size() == 1) {
9717       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9718         int InputFixed = std::find(std::begin(SourceHalfMask),
9719                                    std::end(SourceHalfMask), -1) -
9720                          std::begin(SourceHalfMask) + SourceOffset;
9721         SourceHalfMask[InputFixed - SourceOffset] =
9722             IncomingInputs[0] - SourceOffset;
9723         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9724                      InputFixed);
9725         IncomingInputs[0] = InputFixed;
9726       }
9727     } else if (IncomingInputs.size() == 2) {
9728       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9729           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9730         // We have two non-adjacent or clobbered inputs we need to extract from
9731         // the source half. To do this, we need to map them into some adjacent
9732         // dword slot in the source mask.
9733         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9734                               IncomingInputs[1] - SourceOffset};
9735
9736         // If there is a free slot in the source half mask adjacent to one of
9737         // the inputs, place the other input in it. We use (Index XOR 1) to
9738         // compute an adjacent index.
9739         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9740             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
9741           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9742           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9743           InputsFixed[1] = InputsFixed[0] ^ 1;
9744         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9745                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
9746           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9747           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9748           InputsFixed[0] = InputsFixed[1] ^ 1;
9749         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
9750                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
9751           // The two inputs are in the same DWord but it is clobbered and the
9752           // adjacent DWord isn't used at all. Move both inputs to the free
9753           // slot.
9754           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9755           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9756           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9757           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9758         } else {
9759           // The only way we hit this point is if there is no clobbering
9760           // (because there are no off-half inputs to this half) and there is no
9761           // free slot adjacent to one of the inputs. In this case, we have to
9762           // swap an input with a non-input.
9763           for (int i = 0; i < 4; ++i)
9764             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
9765                    "We can't handle any clobbers here!");
9766           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9767                  "Cannot have adjacent inputs here!");
9768
9769           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9770           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9771
9772           // We also have to update the final source mask in this case because
9773           // it may need to undo the above swap.
9774           for (int &M : FinalSourceHalfMask)
9775             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9776               M = InputsFixed[1] + SourceOffset;
9777             else if (M == InputsFixed[1] + SourceOffset)
9778               M = (InputsFixed[0] ^ 1) + SourceOffset;
9779
9780           InputsFixed[1] = InputsFixed[0] ^ 1;
9781         }
9782
9783         // Point everything at the fixed inputs.
9784         for (int &M : HalfMask)
9785           if (M == IncomingInputs[0])
9786             M = InputsFixed[0] + SourceOffset;
9787           else if (M == IncomingInputs[1])
9788             M = InputsFixed[1] + SourceOffset;
9789
9790         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9791         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9792       }
9793     } else {
9794       llvm_unreachable("Unhandled input size!");
9795     }
9796
9797     // Now hoist the DWord down to the right half.
9798     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
9799     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
9800     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9801     for (int &M : HalfMask)
9802       for (int Input : IncomingInputs)
9803         if (M == Input)
9804           M = FreeDWord * 2 + Input % 2;
9805   };
9806   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9807                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9808   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9809                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9810
9811   // Now enact all the shuffles we've computed to move the inputs into their
9812   // target half.
9813   if (!isNoopShuffleMask(PSHUFLMask))
9814     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9815                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
9816   if (!isNoopShuffleMask(PSHUFHMask))
9817     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9818                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
9819   if (!isNoopShuffleMask(PSHUFDMask))
9820     V = DAG.getBitcast(
9821         VT,
9822         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9823                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9824
9825   // At this point, each half should contain all its inputs, and we can then
9826   // just shuffle them into their final position.
9827   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
9828          "Failed to lift all the high half inputs to the low mask!");
9829   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
9830          "Failed to lift all the low half inputs to the high mask!");
9831
9832   // Do a half shuffle for the low mask.
9833   if (!isNoopShuffleMask(LoMask))
9834     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9835                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
9836
9837   // Do a half shuffle with the high mask after shifting its values down.
9838   for (int &M : HiMask)
9839     if (M >= 0)
9840       M -= 4;
9841   if (!isNoopShuffleMask(HiMask))
9842     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9843                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
9844
9845   return V;
9846 }
9847
9848 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
9849 /// blend if only one input is used.
     ///
     /// A 16-entry byte-selector vector is built for each input. Every defined
     /// selector byte either names a source byte of that input or is 0x80
     /// (ZeroMask below), the selector value used here to request a zeroed result
     /// byte. A byte taken from V1 is therefore zeroed in the V2 shuffle and vice
     /// versa, which is what allows the two PSHUFB results to be merged with a
     /// plain OR.
     ///
     /// \param V1InUse [out] reset to false, then set to true iff at least one
     /// defined V1 selector byte is not 0x80.
     /// \param V2InUse [out] same as \p V1InUse, for V2.
     /// \returns the selected/blended v16i8 value bitcast back to \p VT.
9850 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
9851     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9852     SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
9853   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9854   SDValue V1Mask[16];
9855   SDValue V2Mask[16];
9856   V1InUse = false;
9857   V2InUse = false;
9858
       // Scale is the number of bytes covered by one element of Mask, so byte i
       // of the result belongs to mask element i / Scale at offset i % Scale.
9859   int Size = Mask.size();
9860   int Scale = 16 / Size;
9861   for (int i = 0; i < 16; ++i) {
9862     if (Mask[i / Scale] < 0) {
           // Undef element: leave the selector undef for both inputs. This does
           // not mark either input as used.
9863       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9864     } else {
9865       const int ZeroMask = 0x80;
           // Mask values below Size select from V1, the rest select from V2
           // (rebased by Size). Whichever input is not selected gets ZeroMask.
9866       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
9867                                           : ZeroMask;
9868       int V2Idx = Mask[i / Scale] < Size
9869                       ? ZeroMask
9870                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
           // A zeroable element is zeroed in both shuffles and so does not count
           // as a use of either input.
9871       if (Zeroable[i / Scale])
9872         V1Idx = V2Idx = ZeroMask;
9873       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
9874       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
9875       V1InUse |= (ZeroMask != V1Idx);
9876       V2InUse |= (ZeroMask != V2Idx);
9877     }
9878   }
9879
       // Only emit a PSHUFB for an input that actually contributes bytes.
9880   if (V1InUse)
9881     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9882                      DAG.getBitcast(MVT::v16i8, V1),
9883                      DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
9884   if (V2InUse)
9885     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9886                      DAG.getBitcast(MVT::v16i8, V2),
9887                      DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
9888
9889   // If we need shuffled inputs from both, blend the two.
9890   SDValue V;
9891   if (V1InUse && V2InUse)
9892     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9893   else
         // NOTE(review): when neither input is in use this selects the
         // unshuffled V2; presumably callers never get here with a mask that is
         // entirely undef/zeroable -- confirm.
9894     V = V1InUse ? V1 : V2;
9895
9896   // Cast the result back to the correct type.
9897   return DAG.getBitcast(VT, V);
9898 }
9899
9900 /// \brief Generic lowering of 8-lane i16 shuffles.
9901 ///
9902 /// This handles both single-input shuffles and combined shuffle/blends with
9903 /// two inputs. The single input shuffles are immediately delegated to
9904 /// a dedicated lowering routine.
9905 ///
9906 /// The blends are lowered in one of three fundamental ways. If there are few
9907 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9908 /// of the input is significantly cheaper when lowered as an interleaving of
9909 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9910 /// halves of the inputs separately (making them have relatively few inputs)
9911 /// and then concatenate them.
     ///
     /// The strategies below are attempted in sequence and the first one that
     /// yields a value is returned, so their order is significant.
     ///
     /// \param Mask eight-entry shuffle mask; entries 0-7 select from \p V1 and
     /// entries of 8 or more select from \p V2 (see the counting below).
     /// \returns the lowered value; the final fallback returns unconditionally.
9912 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9913                                        SDValue V1, SDValue V2,
9914                                        const X86Subtarget &Subtarget,
9915                                        SelectionDAG &DAG) {
9916   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9917   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9918   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9919
9920   // Whenever we can lower this as a zext, that instruction is strictly faster
9921   // than any alternative.
9922   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9923           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9924     return ZExt;
9925
       // Mask entries >= 8 refer to elements of V2.
9926   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
9927
       // Single-input case: every path inside this block returns.
9928   if (NumV2Inputs == 0) {
9929     // Check for being able to broadcast a single element.
9930     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9931             DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9932       return Broadcast;
9933
9934     // Try to use shift instructions.
         // V1 is passed for both operands because the mask never references V2.
9935     if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
9936                                                   Subtarget, DAG))
9937       return Shift;
9938
9939     // Use dedicated unpack instructions for masks that match their pattern.
9940     if (SDValue V =
9941             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9942       return V;
9943
9944     // Try to use byte rotation instructions.
9945     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
9946                                                         Mask, Subtarget, DAG))
9947       return Rotate;
9948
9949     // Make a copy of the mask so it can be modified.
9950     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
9951     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
9952                                                      MutableMask, Subtarget,
9953                                                      DAG);
9954   }
9955
       // From here on at least one element comes from V2; the assert checks that
       // at least one also comes from V1.
9956   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
9957          "All single-input shuffles should be canonicalized to be V1-input "
9958          "shuffles.");
9959
9960   // Try to use shift instructions.
9961   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
9962                                                 Subtarget, DAG))
9963     return Shift;
9964
9965   // See if we can use SSE4A Extraction / Insertion.
9966   if (Subtarget.hasSSE4A())
9967     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
9968       return V;
9969
9970   // There are special ways we can lower some single-element blends.
9971   if (NumV2Inputs == 1)
9972     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
9973                                                          Mask, Subtarget, DAG))
9974       return V;
9975
9976   // We have different paths for blend lowering, but they all must use the
9977   // *exact* same predicate.
9978   bool IsBlendSupported = Subtarget.hasSSE41();
9979   if (IsBlendSupported)
9980     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9981                                                   Subtarget, DAG))
9982       return Blend;
9983
9984   if (SDValue Masked =
9985           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9986     return Masked;
9987
9988   // Use dedicated unpack instructions for masks that match their pattern.
9989   if (SDValue V =
9990           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9991     return V;
9992
9993   // Try to use byte rotation instructions.
9994   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9995           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9996     return Rotate;
9997
9998   if (SDValue BitBlend =
9999           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10000     return BitBlend;
10001
10002   // Try to lower by permuting the inputs into an unpack instruction.
10003   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10004                                                             V2, Mask, DAG))
10005     return Unpack;
10006
10007   // If we can't directly blend but can use PSHUFB, that will be better as it
10008   // can both shuffle and set up the inefficient blend.
10009   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
          // The in-use flags are required by the helper's signature but are not
          // consulted here.
10010     bool V1InUse, V2InUse;
10011     return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
10012                                               V1InUse, V2InUse);
10013   }
10014
10015   // We can always bit-blend if we have to so the fallback strategy is to
10016   // decompose into single-input permutes and blends.
10017   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
10018                                                       Mask, DAG);
10019 }
10020
10021 /// \brief Check whether a compaction lowering can be done by dropping even
10022 /// elements and compute how many times even elements must be dropped.
10023 ///
10024 /// This handles shuffles which take every 2^N-th element, i.e. whose stride
10025 /// is a power of two. Example shuffle masks:
10026 ///
10027 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
10028 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10029 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
10030 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
10031 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
10032 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
10033 ///
10034 /// Any of these lanes can of course be undef.
10035 ///
10036 /// This routine only supports N <= 3.
10037 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10038 /// for larger N.
10039 ///
      /// \param Mask the shuffle mask to test; negative entries are undef and are
      /// compatible with every stride.
      /// \param IsSingleInput when true the expected indices wrap modulo
      /// Mask.size(), otherwise modulo twice Mask.size().
      ///
10040 /// \returns N above, or the number of times even elements must be dropped if
10041 /// there is such a number. Otherwise returns zero.
      /// When several N remain viable (e.g. because of undef lanes) the smallest
      /// one is returned; a mask that is entirely undef therefore yields 1.
10042 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
10043                                           bool IsSingleInput) {
10044   // The modulus for the shuffle vector entries is based on whether this is
10045   // a single input or not.
10046   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10047   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10048          "We should only be called with masks with a power-of-2 size!");
10049
        // Since the modulus is a power of two, "x % modulus" is "x & ModMask".
10050   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10051
10052   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10053   // and 2^3 simultaneously. This is because we may have ambiguity with
10054   // partially undef inputs.
        // ViableForN[j] corresponds to N = j + 1.
10055   bool ViableForN[3] = {true, true, true};
10056
10057   for (int i = 0, e = Mask.size(); i < e; ++i) {
10058     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10059     // want.
10060     if (Mask[i] < 0)
10061       continue;
10062
10063     bool IsAnyViable = false;
10064     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10065       if (ViableForN[j]) {
10066         uint64_t N = j + 1;
10067
10068         // The shuffle mask must be equal to (i * 2^N) % M.
10069         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
10070           IsAnyViable = true;
10071         else
10072           ViableForN[j] = false;
10073       }
10074     // Early exit if we exhaust the possible powers of two.
          // At this point every ViableForN entry is false, so the scan below
          // falls through to the "return 0".
10075     if (!IsAnyViable)
10076       break;
10077   }
10078
        // Report the smallest N that is still viable.
10079   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10080     if (ViableForN[j])
10081       return j + 1;
10082
10083   // Return 0 as there is no viable power of two.
10084   return 0;
10085 }
10086
10087 /// \brief Generic lowering of v16i8 shuffles.
10088 ///
10089 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
10090 /// detect any complexity reducing interleaving. If that doesn't help, it uses
10091 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
10092 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
10093 /// back together.
      ///
      /// The strategies below are attempted in sequence and the first one that
      /// yields a value is returned, so their order is significant.
      ///
      /// \param Mask sixteen-entry shuffle mask; entries 0-15 select from \p V1
      /// and entries of 16 or more select from \p V2 (see the counting below).
      /// \returns the lowered v16i8 value; the final fallback returns
      /// unconditionally.
10094 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10095                                        SDValue V1, SDValue V2,
10096                                        const X86Subtarget &Subtarget,
10097                                        SelectionDAG &DAG) {
10098   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10099   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10100   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10101
10102   // Try to use shift instructions.
10103   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
10104                                                 Subtarget, DAG))
10105     return Shift;
10106
10107   // Try to use byte rotation instructions.
10108   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10109           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10110     return Rotate;
10111
10112   // Try to use a zext lowering.
10113   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10114           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10115     return ZExt;
10116
10117   // See if we can use SSE4A Extraction / Insertion.
10118   if (Subtarget.hasSSE4A())
10119     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
10120       return V;
10121
        // Mask entries >= 16 refer to bytes of V2.
10122   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
10123
10124   // For single-input shuffles, there are some nicer lowering tricks we can use.
10125   if (NumV2Elements == 0) {
10126     // Check for being able to broadcast a single element.
10127     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10128             DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10129       return Broadcast;
10130
10131     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
10132     // Notably, this handles splat and partial-splat shuffles more efficiently.
10133     // However, it only makes sense if the pre-duplication shuffle simplifies
10134     // things significantly. Currently, this means we need to be able to
10135     // express the pre-duplication shuffle as an i16 shuffle.
10136     //
10137     // FIXME: We should check for other patterns which can be widened into an
10138     // i16 shuffle as well.
          // Each adjacent pair of result bytes must name the same source byte
          // (or contain an undef) for the duplication trick to apply.
10139     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
10140       for (int i = 0; i < 16; i += 2)
10141         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
10142           return false;
10143
10144       return true;
10145     };
          // Captures by reference and overwrites V1, but only after the last
          // bail-out, so a failed attempt leaves V1 untouched.
10146     auto tryToWidenViaDuplication = [&]() -> SDValue {
10147       if (!canWidenViaDuplication(Mask))
10148         return SDValue();
            // Collect the sorted, de-duplicated source bytes that live in the
            // low half (0-7) and in the high half (8 and up) of V1.
10149       SmallVector<int, 4> LoInputs;
10150       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
10151                    [](int M) { return M >= 0 && M < 8; });
10152       std::sort(LoInputs.begin(), LoInputs.end());
10153       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
10154                      LoInputs.end());
10155       SmallVector<int, 4> HiInputs;
10156       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
10157                    [](int M) { return M >= 8; });
10158       std::sort(HiInputs.begin(), HiInputs.end());
10159       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
10160                      HiInputs.end());
10161
            // Gather everything into the half that already holds at least as
            // many inputs; the other half's inputs are the ones that must move.
10162       bool TargetLo = LoInputs.size() >= HiInputs.size();
10163       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
10164       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
10165
            // PreDupI16Shuffle is the i16 shuffle that gathers the needed words
            // into the target half. LaneMap records, for each original byte
            // index, the byte index it occupies after that shuffle.
10166       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
10167       SmallDenseMap<int, int, 8> LaneMap;
10168       for (int I : InPlaceInputs) {
10169         PreDupI16Shuffle[I/2] = I/2;
10170         LaneMap[I] = I;
10171       }
            // [j, je) is the range of word slots belonging to the target half.
10172       int j = TargetLo ? 0 : 4, je = j + 4;
10173       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
10174         // Check if j is already a shuffle of this input. This happens when
10175         // there are two adjacent bytes after we move the low one.
10176         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
10177           // If we haven't yet mapped the input, search for a slot into which
10178           // we can map it.
10179           while (j < je && PreDupI16Shuffle[j] >= 0)
10180             ++j;
10181
10182           if (j == je)
10183             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
10184             return SDValue();
10185
10186           // Map this input with the i16 shuffle.
10187           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
10188         }
10189
10190         // Update the lane map based on the mapping we ended up with.
10191         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
10192       }
10193       V1 = DAG.getBitcast(
10194           MVT::v16i8,
10195           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10196                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
10197
10198       // Unpack the bytes to form the i16s that will be shuffled into place.
10199       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10200                        MVT::v16i8, V1, V1);
10201
            // Build the final i16 shuffle. Byte positions from LaneMap are
            // rebased by 8 when the high half was the one unpacked.
10202       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10203       for (int i = 0; i < 16; ++i)
10204         if (Mask[i] >= 0) {
10205           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
10206           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
10207           if (PostDupI16Shuffle[i / 2] < 0)
10208             PostDupI16Shuffle[i / 2] = MappedMask;
10209           else
10210             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
10211                    "Conflicting entrties in the original shuffle!");
10212         }
10213       return DAG.getBitcast(
10214           MVT::v16i8,
10215           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10216                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
10217     };
10218     if (SDValue V = tryToWidenViaDuplication())
10219       return V;
10220   }
10221
10222   if (SDValue Masked =
10223           lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
10224     return Masked;
10225
10226   // Use dedicated unpack instructions for masks that match their pattern.
10227   if (SDValue V =
10228           lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
10229     return V;
10230
10231   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
10232   // with PSHUFB. It is important to do this before we attempt to generate any
10233   // blends but after all of the single-input lowerings. If the single input
10234   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
10235   // want to preserve that and we can DAG combine any longer sequences into
10236   // a PSHUFB in the end. But once we start blending from multiple inputs,
10237   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
10238   // and there are *very* few patterns that would actually be faster than the
10239   // PSHUFB approach because of its ability to zero lanes.
10240   //
10241   // FIXME: The only exceptions to the above are blends which are exact
10242   // interleavings with direct instructions supporting them. We currently don't
10243   // handle those well here.
10244   if (Subtarget.hasSSSE3()) {
10245     bool V1InUse = false;
10246     bool V2InUse = false;
10247
          // The PSHUFB form is built up front so that V1InUse/V2InUse are known
          // before deciding whether to try the alternatives below; it is also
          // the result returned when none of them applies.
10248     SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
10249         DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
10250
10251     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
10252     // do so. This avoids using them to handle blends-with-zero which is
10253     // important as a single pshufb is significantly faster for that.
10254     if (V1InUse && V2InUse) {
10255       if (Subtarget.hasSSE41())
10256         if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
10257                                                       Mask, Subtarget, DAG))
10258           return Blend;
10259
10260       // We can use an unpack to do the blending rather than an or in some
10261       // cases. Even though the or may be (very minorly) more efficient, we
10262       // preference this lowering because there are common cases where part of
10263       // the complexity of the shuffles goes away when we do the final blend as
10264       // an unpack.
10265       // FIXME: It might be worth trying to detect if the unpack-feeding
10266       // shuffles will both be pshufb, in which case we shouldn't bother with
10267       // this.
10268       if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10269               DL, MVT::v16i8, V1, V2, Mask, DAG))
10270         return Unpack;
10271     }
10272
10273     return PSHUFB;
10274   }
10275
10276   // There are special ways we can lower some single-element blends.
10277   if (NumV2Elements == 1)
10278     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
10279                                                          Mask, Subtarget, DAG))
10280       return V;
10281
10282   if (SDValue BitBlend =
10283           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
10284     return BitBlend;
10285
10286   // Check whether a compaction lowering can be done. This handles shuffles
10287   // which take every 2^N-th element (N = 1, 2 or 3). See the helper function
10288   // for details.
10289   //
10290   // We special case these as they can be particularly efficiently handled with
10291   // the PACKUSWB instruction on x86 and they show up in common patterns of
10292   // rearranging bytes to truncate wide elements.
10293   bool IsSingleInput = V2.isUndef();
10294   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
10295     // NumEvenDrops is the power of two stride of the elements. Another way of
10296     // thinking about it is that we need to drop the even elements this many
10297     // times to get the original input.
10298
10299     // First we need to zero all the dropped bytes.
10300     assert(NumEvenDrops <= 3 &&
10301            "No support for dropping even elements more than 3 times.");
10302     // We use the mask type to pick which bytes are preserved based on how many
10303     // elements are dropped.
10304     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
10305     SDValue ByteClearMask = DAG.getBitcast(
10306         MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
10307     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
10308     if (!IsSingleInput)
10309       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
10310
10311     // Now pack things back together.
          // In the single-input case V1 is packed with itself.
10312     V1 = DAG.getBitcast(MVT::v8i16, V1);
10313     V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
10314     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
          // Each further drop packs the previous result with itself.
10315     for (int i = 1; i < NumEvenDrops; ++i) {
10316       Result = DAG.getBitcast(MVT::v8i16, Result);
10317       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
10318     }
10319
10320     return Result;
10321   }
10322
10323   // Handle multi-input cases by blending single-input shuffles.
10324   if (NumV2Elements > 0)
10325     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
10326                                                       Mask, DAG);
10327
10328   // The fallback path for single-input shuffles widens this into two v8i16
10329   // vectors with unpacks, shuffles those, and then pulls them back together
10330   // with a pack.
10331   SDValue V = V1;
10332
        // LoBlendMask holds the first eight mask entries and HiBlendMask the
        // last eight; undef entries stay -1.
10333   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10334   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10335   for (int i = 0; i < 16; ++i)
10336     if (Mask[i] >= 0)
10337       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
10338
10339   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
10340
10341   SDValue VLoHalf, VHiHalf;
10342   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
10343   // them out and avoid using UNPCK{L,H} to extract the elements of V as
10344   // i16s.
10345   if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
10346                    [](int M) { return M >= 0 && M % 2 == 1; }) &&
10347       std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
10348                    [](int M) { return M >= 0 && M % 2 == 1; })) {
10349     // Use a mask to drop the high bytes.
10350     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
10351     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
10352                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
10353
10354     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
10355     VHiHalf = DAG.getUNDEF(MVT::v8i16);
10356
10357     // Squash the masks to point directly into VLoHalf.
          // Every used index is even here, so halving converts a byte index into
          // the index of the i16 lane that contains it.
10358     for (int &M : LoBlendMask)
10359       if (M >= 0)
10360         M /= 2;
10361     for (int &M : HiBlendMask)
10362       if (M >= 0)
10363         M /= 2;
10364   } else {
10365     // Otherwise just unpack the low half of V into VLoHalf and the high half into
10366     // VHiHalf so that we can blend them as i16s.
10367     VLoHalf = DAG.getBitcast(
10368         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
10369     VHiHalf = DAG.getBitcast(
10370         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
10371   }
10372
        // Shuffle the widened halves as i16 vectors, then pack the two results
        // back into a single v16i8.
10373   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
10374   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
10375
10376   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
10377 }
10378
10379 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
10380 ///
10381 /// This routine breaks down the specific type of 128-bit shuffle and
10382 /// dispatches to the lowering routines accordingly.
      ///
      /// \param VT the vector type being shuffled; one of v2i64, v2f64, v4i32,
      /// v4f32, v8i16 or v16i8. Any other type reaches llvm_unreachable.
      /// \returns whatever the type-specific lowering routine returns.
10383 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10384                                         MVT VT, SDValue V1, SDValue V2,
10385                                         const X86Subtarget &Subtarget,
10386                                         SelectionDAG &DAG) {
10387   switch (VT.SimpleTy) {
10388   case MVT::v2i64:
10389     return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10390   case MVT::v2f64:
10391     return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10392   case MVT::v4i32:
10393     return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10394   case MVT::v4f32:
10395     return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10396   case MVT::v8i16:
10397     return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10398   case MVT::v16i8:
10399     return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10400
        // No other type has a lowering routine in this dispatcher.
10401   default:
10402     llvm_unreachable("Unimplemented!");
10403   }
10404 }
10405
10406 /// \brief Helper function to test whether a shuffle mask could be
10407 /// simplified by widening the elements being shuffled.
10408 ///
10409 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10410 /// leaves it in an unspecified state.
10411 ///
10412 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10413 /// shuffle masks. The latter have the special property of a '-2' representing
10414 /// a zero-ed lane of a vector.
      ///
      /// \param Mask the mask to widen, examined two adjacent entries at a time.
      /// NOTE(review): the loop reads Mask[i + 1] unconditionally, so this
      /// presumably expects Mask.size() to be even -- confirm against callers.
      /// \param WidenedMask [out] reset to Mask.size() / 2 entries on entry and
      /// filled with one entry per pair that could be widened.
      /// \returns true if every pair of entries can be expressed as a single
      /// element of twice the width, false as soon as one pair cannot.
10415 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10416                                     SmallVectorImpl<int> &WidenedMask) {
10417   WidenedMask.assign(Mask.size() / 2, 0);
10418   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10419     // If both elements are undef, it's trivial.
10420     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10421       WidenedMask[i/2] = SM_SentinelUndef;
10422       continue;
10423     }
10424
10425     // Check for an undef mask and a mask value properly aligned to fit with
10426     // a pair of values. If we find such a case, use the non-undef mask's value.
          // The defined entry must sit in the matching half of its source pair:
          // an odd index in the second slot, or an even index in the first slot.
10427     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10428       WidenedMask[i/2] = Mask[i + 1] / 2;
10429       continue;
10430     }
10431     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10432       WidenedMask[i/2] = Mask[i] / 2;
10433       continue;
10434     }
10435
10436     // When zeroing, we need to spread the zeroing across both lanes to widen.
          // A zero entry can only be paired with another zero or an undef entry;
          // pairing it with a real index makes widening impossible.
10437     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10438       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10439           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10440         WidenedMask[i/2] = SM_SentinelZero;
10441         continue;
10442       }
10443       return false;
10444     }
10445
10446     // Finally check if the two mask values are adjacent and aligned with
10447     // a pair.
10448     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10449       WidenedMask[i/2] = Mask[i] / 2;
10450       continue;
10451     }
10452
10453     // Otherwise we can't safely widen the elements used in this shuffle.
10454     return false;
10455   }
10456   assert(WidenedMask.size() == Mask.size() / 2 &&
10457          "Incorrect size of mask after widening the elements!");
10458
10459   return true;
10460 }
10461
10462 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
10463 ///
10464 /// This routine just extracts two subvectors, shuffles them independently, and
10465 /// then concatenates them back together. This should work effectively with all
10466 /// AVX vector shuffle types.
      ///
      /// \param DL   Location passed to every node created here.
      /// \param VT   Type of the shuffle; asserted to be at least 256 bits wide.
      /// \param V1   First shuffle input; asserted to have type VT.
      /// \param V2   Second shuffle input; asserted to have type VT.
      /// \param Mask Shuffle mask. Its first half produces the low half of the
      ///             result and its second half produces the high half.
      /// \param DAG  DAG in which the new nodes are created.
      /// \returns A CONCAT_VECTORS node of type VT joining the two half-width
      ///          results.
10467 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10468                                           SDValue V2, ArrayRef<int> Mask,
10469                                           SelectionDAG &DAG) {
10470   assert(VT.getSizeInBits() >= 256 &&
10471          "Only for 256-bit or wider vector shuffles!");
10472   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10473   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10474
        // Each half of the mask is lowered on its own by HalfBlend below.
10475   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10476   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10477
10478   int NumElements = VT.getVectorNumElements();
10479   int SplitNumElements = NumElements / 2;
10480   MVT ScalarVT = VT.getVectorElementType();
10481   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10482
10483   // Rather than splitting build-vectors, just build two narrower build
10484   // vectors. This helps shuffling with splats and zeros.
10485   auto SplitVector = [&](SDValue V) {
10486     V = peekThroughBitcasts(V);
10487
          // V may now have a type other than VT (peekThroughBitcasts presumably
          // looks through bitcast nodes), so the halves are formed in V's own type
          // and only bitcast to SplitVT when they are returned.
10488     MVT OrigVT = V.getSimpleValueType();
10489     int OrigNumElements = OrigVT.getVectorNumElements();
10490     int OrigSplitNumElements = OrigNumElements / 2;
10491     MVT OrigScalarVT = OrigVT.getVectorElementType();
10492     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
10493
10494     SDValue LoV, HiV;
10495
10496     auto *BV = dyn_cast<BuildVectorSDNode>(V);
10497     if (!BV) {
            // Not a build vector: take the two halves with subvector extracts.
10498       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10499                         DAG.getIntPtrConstant(0, DL));
10500       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10501                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
10502     } else {
10503
            // Build vector: distribute its operands over two half-sized build
            // vectors instead of extracting from it.
10504       SmallVector<SDValue, 16> LoOps, HiOps;
10505       for (int i = 0; i < OrigSplitNumElements; ++i) {
10506         LoOps.push_back(BV->getOperand(i));
10507         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
10508       }
10509       LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
10510       HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
10511     }
10512     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
10513                           DAG.getBitcast(SplitVT, HiV));
10514   };
10515
10516   SDValue LoV1, HiV1, LoV2, HiV2;
10517   std::tie(LoV1, HiV1) = SplitVector(V1);
10518   std::tie(LoV2, HiV2) = SplitVector(V2);
10519
10520   // Now create two 4-way blends of these half-width vectors.
10521   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10522     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
          // V1BlendMask / V2BlendMask index the (Lo, Hi) halves of one input as a
          // shuffle operand pair. BlendMask then picks, per result element, lane i
          // of the V1 side (value i) or of the V2 side (SplitNumElements + i).
          // Entries for undef mask elements stay at -1 in all three.
10523     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
10524     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
10525     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
10526     for (int i = 0; i < SplitNumElements; ++i) {
10527       int M = HalfMask[i];
10528       if (M >= NumElements) {
              // Element comes from V2; note which half of V2 it lives in.
10529         if (M >= NumElements + SplitNumElements)
10530           UseHiV2 = true;
10531         else
10532           UseLoV2 = true;
10533         V2BlendMask[i] = M - NumElements;
10534         BlendMask[i] = SplitNumElements + i;
10535       } else if (M >= 0) {
              // Element comes from V1; note which half of V1 it lives in.
10536         if (M >= SplitNumElements)
10537           UseHiV1 = true;
10538         else
10539           UseLoV1 = true;
10540         V1BlendMask[i] = M;
10541         BlendMask[i] = i;
10542       }
10543     }
10544
10545     // Because the lowering happens after all combining takes place, we need to
10546     // manually combine these blend masks as much as possible so that we create
10547     // a minimal number of high-level vector shuffle nodes.
10548
10549     // First try just blending the halves of V1 or V2.
10550     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10551       return DAG.getUNDEF(SplitVT);
10552     if (!UseLoV2 && !UseHiV2)
10553       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10554     if (!UseLoV1 && !UseHiV1)
10555       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10556
          // Both inputs contribute. Shuffle each input's halves together only when
          // both of its halves are used; otherwise feed the single used half into
          // the final blend directly.
10557     SDValue V1Blend, V2Blend;
10558     if (UseLoV1 && UseHiV1) {
10559       V1Blend =
10560         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10561     } else {
10562       // We only use half of V1 so map the usage down into the final blend mask.
            // Indices into HiV1 are rebased to start at 0 because HiV1 itself is
            // now the first operand of the final shuffle.
10563       V1Blend = UseLoV1 ? LoV1 : HiV1;
10564       for (int i = 0; i < SplitNumElements; ++i)
10565         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10566           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10567     }
10568     if (UseLoV2 && UseHiV2) {
10569       V2Blend =
10570         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10571     } else {
10572       // We only use half of V2 so map the usage down into the final blend mask.
            // The result must land in the second operand's index range: LoV2
            // indices get SplitNumElements added, HiV2 indices already have it.
10573       V2Blend = UseLoV2 ? LoV2 : HiV2;
10574       for (int i = 0; i < SplitNumElements; ++i)
10575         if (BlendMask[i] >= SplitNumElements)
10576           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10577     }
10578     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10579   };
10580   SDValue Lo = HalfBlend(LoMask);
10581   SDValue Hi = HalfBlend(HiMask);
10582   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10583 }
10584
10585 /// \brief Either split a vector in halves or decompose the shuffles and the
10586 /// blend.
10587 ///
10588 /// This is provided as a good fallback for many lowerings of non-single-input
10589 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10590 /// between splitting the shuffle into 128-bit components and stitching those
10591 /// back together vs. extracting the single-input shuffles and blending those
10592 /// results.
10593 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
10594                                                 SDValue V1, SDValue V2,
10595                                                 ArrayRef<int> Mask,
10596                                                 SelectionDAG &DAG) {
10597   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
10598          "shuffles as it could then recurse on itself.");
10599   int Size = Mask.size();
10600
10601   // If this can be modeled as a broadcast of two elements followed by a blend,
10602   // prefer that lowering. This is especially important because broadcasts can
10603   // often fold with memory operands.
10604   auto DoBothBroadcast = [&] {
10605     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10606     for (int M : Mask)
10607       if (M >= Size) {
10608         if (V2BroadcastIdx < 0)
10609           V2BroadcastIdx = M - Size;
10610         else if (M - Size != V2BroadcastIdx)
10611           return false;
10612       } else if (M >= 0) {
10613         if (V1BroadcastIdx < 0)
10614           V1BroadcastIdx = M;
10615         else if (M != V1BroadcastIdx)
10616           return false;
10617       }
10618     return true;
10619   };
10620   if (DoBothBroadcast())
10621     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10622                                                       DAG);
10623
10624   // If the inputs all stem from a single 128-bit lane of each input, then we
10625   // split them rather than blending because the split will decompose to
10626   // unusually few instructions.
10627   int LaneCount = VT.getSizeInBits() / 128;
10628   int LaneSize = Size / LaneCount;
10629   SmallBitVector LaneInputs[2];
10630   LaneInputs[0].resize(LaneCount, false);
10631   LaneInputs[1].resize(LaneCount, false);
10632   for (int i = 0; i < Size; ++i)
10633     if (Mask[i] >= 0)
10634       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10635   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10636     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10637
10638   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10639   // that the decomposed single-input shuffles don't end up here.
10640   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10641 }
10642
10643 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10644 /// a permutation and blend of those lanes.
10645 ///
10646 /// This essentially blends the out-of-lane inputs to each lane into the lane
10647 /// from a permuted copy of the vector. This lowering strategy results in four
10648 /// instructions in the worst case for a single-input cross lane shuffle which
10649 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10650 /// of. Special cases for each particular shuffle pattern should be handled
10651 /// prior to trying this lowering.
      ///
      /// Unless both 128-bit lanes supply lane-crossing elements, the shuffle is
      /// handed to splitAndLowerVectorShuffle instead. Otherwise V2 is asserted to
      /// be undef and the result is a shuffle of V1 with a half-swapped copy of V1.
      ///
      /// \param DL   Location passed to every node created here.
      /// \param VT   Type of the shuffle; asserted to be a 256-bit vector.
      /// \param V1   First shuffle input.
      /// \param V2   Second shuffle input; must be undef on the permute path.
      /// \param Mask Shuffle mask with one entry per element of VT.
      /// \param DAG  DAG in which the new nodes are created.
10652 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
10653                                                        SDValue V1, SDValue V2,
10654                                                        ArrayRef<int> Mask,
10655                                                        SelectionDAG &DAG) {
10656   // FIXME: This should probably be generalized for 512-bit vectors as well.
10657   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
10658   int Size = Mask.size();
10659   int LaneSize = Size / 2;
10660
10661   // If there are only inputs from one 128-bit lane, splitting will in fact be
10662   // less expensive. The flags track whether the given lane contains an element
10663   // that crosses to another lane.
        // (Mask[i] % Size) / LaneSize is the lane the element is read from,
        // irrespective of which input it belongs to; i / LaneSize is the lane it
        // is written to.
10664   bool LaneCrossing[2] = {false, false};
10665   for (int i = 0; i < Size; ++i)
10666     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10667       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10668   if (!LaneCrossing[0] || !LaneCrossing[1])
10669     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10670
10671   assert(V2.isUndef() &&
10672          "This last part of this routine only works on single input shuffles");
10673
        // In-lane elements keep their original index and read from V1. Crossing
        // elements are redirected to the second shuffle operand (index + Size),
        // keeping their offset within the lane but using the destination lane.
10674   SmallVector<int, 32> FlippedBlendMask(Size);
10675   for (int i = 0; i < Size; ++i)
10676     FlippedBlendMask[i] =
10677         Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10678                                 ? Mask[i]
10679                                 : Mask[i] % LaneSize +
10680                                       (i / LaneSize) * LaneSize + Size);
10681
10682   // Flip the vector, and blend the results which should now be in-lane. The
10683   // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10684   // 5 for the high source. The value 3 selects the high half of source 2 and
10685   // the value 2 selects the low half of source 2. We only use source 2 to
10686   // allow folding it into a memory operand.
10687   unsigned PERMMask = 3 | 2 << 4;
10688   SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10689                                 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
10690   return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10691 }
10692
10693 /// \brief Handle lowering 2-lane 128-bit shuffles.
      ///
      /// Tries, in order: a blend; when neither input is an all-zeros build vector,
      /// a 128-bit subvector insert (for the masks {0, 1, 0, 1} and {0, 1, 4, 5});
      /// and finally a VPERM2X128 node whose immediate also encodes any all-zeros
      /// input.
      ///
      /// \param DL        Location passed to every node created here.
      /// \param VT        Type of the shuffle.
      /// \param V1        First shuffle input.
      /// \param V2        Second shuffle input.
      /// \param Mask      Indexed below as a four-entry mask: entries 0-1 describe
      ///                  the low 128 bits of the result, entries 2-3 the high.
      /// \param Subtarget Queried for AVX2 support.
      /// \param DAG       DAG in which the new nodes are created.
      /// \returns The lowered node, or an empty SDValue when the insert pattern
      ///          matched but the subtarget has AVX2 and V2 is undef.
10694 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10695                                         SDValue V2, ArrayRef<int> Mask,
10696                                         const X86Subtarget &Subtarget,
10697                                         SelectionDAG &DAG) {
10698   // TODO: If minimizing size and one of the inputs is a zero vector and the
10699   // zero vector has only one use, we could use a VPERM2X128 to save the
10700   // instruction bytes needed to explicitly generate the zero vector.
10701
10702   // Blends are faster and handle all the non-lane-crossing cases.
10703   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10704                                                 Subtarget, DAG))
10705     return Blend;
10706
10707   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
10708   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
10709
10710   // If either input operand is a zero vector, use VPERM2X128 because its mask
10711   // allows us to replace the zero input with an implicit zero.
        // The subvector-insert patterns are therefore only tried when neither
        // input is zero.
10712   if (!IsV1Zero && !IsV2Zero) {
10713     // Check for patterns which can be matched with a single insert of a 128-bit
10714     // subvector.
10715     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
10716     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
10717       // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
10718       if (Subtarget.hasAVX2() && V2.isUndef())
10719         return SDValue();
10720
            // Concatenate the low half of V1 with the low half of V1 (for
            // {0, 1, 0, 1}) or of V2 (for {0, 1, 4, 5}).
10721       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10722                                    VT.getVectorNumElements() / 2);
10723       SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10724                                 DAG.getIntPtrConstant(0, DL));
10725       SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10726                                 OnlyUsesV1 ? V1 : V2,
10727                                 DAG.getIntPtrConstant(0, DL));
10728       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10729     }
10730   }
10731
10732   // Otherwise form a 128-bit permutation. After accounting for undefs,
10733   // convert the 64-bit shuffle mask selection values into 128-bit
10734   // selection bits by dividing the indexes by 2 and shifting into positions
10735   // defined by a vperm2*128 instruction's immediate control byte.
10736
10737   // The immediate permute control byte looks like this:
10738   //    [1:0] - select 128 bits from sources for low half of destination
10739   //    [2]   - ignore
10740   //    [3]   - zero low half of destination
10741   //    [5:4] - select 128 bits from sources for high half of destination
10742   //    [6]   - ignore
10743   //    [7]   - zero high half of destination
10744
        // For each half of the result use its first defined mask entry, falling
        // back to the second entry and then to 0 when both are undef.
10745   int MaskLO = Mask[0];
10746   if (MaskLO == SM_SentinelUndef)
10747     MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
10748
10749   int MaskHI = Mask[2];
10750   if (MaskHI == SM_SentinelUndef)
10751     MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
10752
10753   unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
10754
10755   // If either input is a zero vector, replace it with an undef input.
10756   // Shuffle mask values <  4 are selecting elements of V1.
10757   // Shuffle mask values >= 4 are selecting elements of V2.
10758   // Adjust each half of the permute mask by clearing the half that was
10759   // selecting the zero vector and setting the zero mask bit.
        // 0x08 is bit 3 (zero the low half) and 0x80 is bit 7 (zero the high half).
10760   if (IsV1Zero) {
10761     V1 = DAG.getUNDEF(VT);
10762     if (MaskLO < 4)
10763       PermMask = (PermMask & 0xf0) | 0x08;
10764     if (MaskHI < 4)
10765       PermMask = (PermMask & 0x0f) | 0x80;
10766   }
10767   if (IsV2Zero) {
10768     V2 = DAG.getUNDEF(VT);
10769     if (MaskLO >= 4)
10770       PermMask = (PermMask & 0xf0) | 0x08;
10771     if (MaskHI >= 4)
10772       PermMask = (PermMask & 0x0f) | 0x80;
10773   }
10774
10775   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10776                      DAG.getConstant(PermMask, DL, MVT::i8));
10777 }
10778
10779 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10780 /// shuffling each lane.
10781 ///
10782 /// This will only succeed when the result of fixing the 128-bit lanes results
10783 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10784 /// each 128-bit lane. This handles many cases where we can quickly blend away
10785 /// the lane crosses early and then use simpler shuffles within each lane.
10786 ///
10787 /// FIXME: It might be worthwhile at some point to support this without
10788 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10789 /// in x86 only floating point has interesting non-repeating shuffles, and even
10790 /// those are still *marginally* more expensive.
      ///
      /// \param DL        Location passed to every node created here.
      /// \param VT        Type of the shuffle; must span more than one 128-bit
      ///                  lane (asserted).
      /// \param V1        First shuffle input.
      /// \param V2        Second shuffle input; asserted not to be undef.
      /// \param Mask      Shuffle mask with one entry per element of VT.
      /// \param Subtarget Not referenced in the body.
      /// \param DAG       DAG in which the new nodes are created.
      /// \returns The two-step shuffle, or an empty SDValue when some result lane
      ///          reads from more than one source lane or the in-lane pattern is
      ///          not the same in every lane.
10791 static SDValue lowerVectorShuffleByMerging128BitLanes(
10792     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10793     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10794   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
10795
10796   int Size = Mask.size();
10797   int LaneSize = 128 / VT.getScalarSizeInBits();
10798   int NumLanes = Size / LaneSize;
10799   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10800
10801   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10802   // check whether the in-128-bit lane shuffles share a repeating pattern.
        // Lanes[j] is the source lane feeding result lane j, numbered across both
        // inputs (Mask[i] / LaneSize); InLaneMask[k] is the common within-lane
        // source offset for position k of every lane. -1 means "not set yet".
10803   SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
10804   SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
10805   for (int i = 0; i < Size; ++i) {
10806     if (Mask[i] < 0)
10807       continue;
10808
10809     int j = i / LaneSize;
10810
10811     if (Lanes[j] < 0) {
10812       // First entry we've seen for this lane.
10813       Lanes[j] = Mask[i] / LaneSize;
10814     } else if (Lanes[j] != Mask[i] / LaneSize) {
10815       // This doesn't match the lane selected previously!
10816       return SDValue();
10817     }
10818
10819     // Check that within each lane we have a consistent shuffle mask.
10820     int k = i % LaneSize;
10821     if (InLaneMask[k] < 0) {
10822       InLaneMask[k] = Mask[i] % LaneSize;
10823     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10824       // This doesn't fit a repeating in-lane mask.
10825       return SDValue();
10826     }
10827   }
10828
10829   // First shuffle the lanes into place.
        // The lane shuffle is done on 64-bit elements, so each 128-bit lane is
        // represented by two consecutive entries of LaneMask.
10830   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10831                                 VT.getSizeInBits() / 64);
10832   SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
10833   for (int i = 0; i < NumLanes; ++i)
10834     if (Lanes[i] >= 0) {
10835       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10836       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10837     }
10838
10839   V1 = DAG.getBitcast(LaneVT, V1);
10840   V2 = DAG.getBitcast(LaneVT, V2);
10841   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10842
10843   // Cast it back to the type we actually want.
10844   LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
10845
10846   // Now do a simple shuffle that isn't lane crossing.
        // Every defined element reads from its own lane of LaneShuffle, at the
        // within-lane offset given by the original mask.
10847   SmallVector<int, 8> NewMask((unsigned)Size, -1);
10848   for (int i = 0; i < Size; ++i)
10849     if (Mask[i] >= 0)
10850       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10851   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10852          "Must not introduce lane crosses at this point!");
10853
10854   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10855 }
10856
10857 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
10858 /// This allows for fast cases such as subvector extraction/insertion
10859 /// or shuffling smaller vector types which can lower more efficiently.
      ///
      /// \param DL        Location passed to every node created here.
      /// \param VT        Type of the shuffle; asserted to be a 256-bit vector.
      /// \param V1        First shuffle input.
      /// \param V2        Second shuffle input.
      /// \param Mask      Shuffle mask with one entry per element of VT.
      /// \param Subtarget Queried for AVX2 support.
      /// \param DAG       DAG in which the new nodes are created.
      /// \returns The lowered node, or an empty SDValue when neither half of the
      ///          mask is entirely undef, when more than two input halves are
      ///          referenced, or when one of the heuristics below declines.
10860 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
10861                                                SDValue V1, SDValue V2,
10862                                                ArrayRef<int> Mask,
10863                                                const X86Subtarget &Subtarget,
10864                                                SelectionDAG &DAG) {
10865   assert(VT.is256BitVector() && "Expected 256-bit vector");
10866
10867   unsigned NumElts = VT.getVectorNumElements();
10868   unsigned HalfNumElts = NumElts / 2;
10869   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
10870
10871   bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
10872   bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
10873   if (!UndefLower && !UndefUpper)
10874     return SDValue();
10875
10876   // Upper half is undef and lower half is whole upper subvector.
10877   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
10878   if (UndefUpper &&
10879       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
10880     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10881                              DAG.getIntPtrConstant(HalfNumElts, DL));
10882     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10883                        DAG.getIntPtrConstant(0, DL));
10884   }
10885
10886   // Lower half is undef and upper half is whole lower subvector.
10887   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
        // NOTE: despite its name, 'Hi' below holds the lower half of V1 (it is
        // extracted at index 0) and is inserted into the upper half of the result.
10888   if (UndefLower &&
10889       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
10890     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10891                              DAG.getIntPtrConstant(0, DL));
10892     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10893                        DAG.getIntPtrConstant(HalfNumElts, DL));
10894   }
10895
10896   // If the shuffle only uses two of the four halves of the input operands,
10897   // then extract them and perform the 'half' shuffle at half width.
10898   // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
        // HalfIdx1 / HalfIdx2 record which input halves become the first and
        // second operand of the half-width shuffle (-1 while unassigned). Offset
        // is the start of the defined half of Mask and of the result.
10899   int HalfIdx1 = -1, HalfIdx2 = -1;
10900   SmallVector<int, 8> HalfMask(HalfNumElts);
10901   unsigned Offset = UndefLower ? HalfNumElts : 0;
10902   for (unsigned i = 0; i != HalfNumElts; ++i) {
10903     int M = Mask[i + Offset];
10904     if (M < 0) {
            // Negative (sentinel) entries are copied through unchanged.
10905       HalfMask[i] = M;
10906       continue;
10907     }
10908
10909     // Determine which of the 4 half vectors this element is from.
10910     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
10911     int HalfIdx = M / HalfNumElts;
10912
10913     // Determine the element index into its half vector source.
10914     int HalfElt = M % HalfNumElts;
10915
10916     // We can shuffle with up to 2 half vectors, set the new 'half'
10917     // shuffle mask accordingly.
10918     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
10919       HalfMask[i] = HalfElt;
10920       HalfIdx1 = HalfIdx;
10921       continue;
10922     }
10923     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
            // Elements of the second half vector are addressed past the first.
10924       HalfMask[i] = HalfElt + HalfNumElts;
10925       HalfIdx2 = HalfIdx;
10926       continue;
10927     }
10928
10929     // Too many half vectors referenced.
10930     return SDValue();
10931   }
10932   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
10933
10934   // Only shuffle the halves of the inputs when useful.
        // Count how many of the selected halves are lower halves (index 0 or 2)
        // and how many are upper halves (index 1 or 3).
10935   int NumLowerHalves =
10936       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
10937   int NumUpperHalves =
10938       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
10939
10940   // uuuuXXXX - don't extract uppers just to insert again.
10941   if (UndefLower && NumUpperHalves != 0)
10942     return SDValue();
10943
10944   // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
10945   if (UndefUpper && NumUpperHalves == 2)
10946     return SDValue();
10947
10948   // AVX2 - XXXXuuuu - always extract lowers.
10949   if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
10950     // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
10951     if (VT == MVT::v4f64 || VT == MVT::v4i64)
10952       return SDValue();
10953     // AVX2 supports variable 32-bit element cross-lane shuffles.
10954     if (VT == MVT::v8f32 || VT == MVT::v8i32) {
10955       // XXXXuuuu - don't extract lowers and uppers.
10956       if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
10957         return SDValue();
10958     }
10959   }
10960
        // Materialize the half vector with the given index (see the 0..3 numbering
        // above); a negative index yields an undef half.
10961   auto GetHalfVector = [&](int HalfIdx) {
10962     if (HalfIdx < 0)
10963       return DAG.getUNDEF(HalfVT);
10964     SDValue V = (HalfIdx < 2 ? V1 : V2);
10965     HalfIdx = (HalfIdx % 2) * HalfNumElts;
10966     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
10967                        DAG.getIntPtrConstant(HalfIdx, DL));
10968   };
10969
        // Shuffle at half width, then place the result in the defined half of an
        // otherwise undef full-width vector.
10970   SDValue Half1 = GetHalfVector(HalfIdx1);
10971   SDValue Half2 = GetHalfVector(HalfIdx2);
10972   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
10973   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
10974                      DAG.getIntPtrConstant(Offset, DL));
10975 }
10976
10977 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10978 /// given mask.
10979 ///
10980 /// This returns true if the elements from a particular input are already in the
10981 /// slot required by the given mask and require no permutation.
10982 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10983   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10984   int Size = Mask.size();
10985   for (int i = 0; i < Size; ++i)
10986     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10987       return false;
10988
10989   return true;
10990 }
10991
10992 /// Handle case where shuffle sources are coming from the same 128-bit lane and
10993 /// every lane can be represented as the same repeating mask - allowing us to
10994 /// shuffle the sources with the repeating shuffle and then permute the result
10995 /// to the destination lanes.
10996 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
10997     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10998     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10999   int NumElts = VT.getVectorNumElements();
11000   int NumLanes = VT.getSizeInBits() / 128;
11001   int NumLaneElts = NumElts / NumLanes;
11002
11003   // On AVX2 we may be able to just shuffle the lowest elements and then
11004   // broadcast the result.
11005   if (Subtarget.hasAVX2()) {
11006     for (unsigned BroadcastSize : {16, 32, 64}) {
11007       if (BroadcastSize <= VT.getScalarSizeInBits())
11008         continue;
11009       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
11010
11011       // Attempt to match a repeating pattern every NumBroadcastElts,
11012       // accounting for UNDEFs but only references the lowest 128-bit
11013       // lane of the inputs.
11014       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11015         for (int i = 0; i != NumElts; i += NumBroadcastElts)
11016           for (int j = 0; j != NumBroadcastElts; ++j) {
11017             int M = Mask[i + j];
11018             if (M < 0)
11019               continue;
11020             int &R = RepeatMask[j];
11021             if (0 != ((M % NumElts) / NumLaneElts))
11022               return false;
11023             if (0 <= R && R != M)
11024               return false;
11025             R = M;
11026           }
11027         return true;
11028       };
11029
11030       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11031       if (!FindRepeatingBroadcastMask(RepeatMask))
11032         continue;
11033
11034       // Shuffle the (lowest) repeated elements in place for broadcast.
11035       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11036
11037       // Shuffle the actual broadcast.
11038       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11039       for (int i = 0; i != NumElts; i += NumBroadcastElts)
11040         for (int j = 0; j != NumBroadcastElts; ++j)
11041           BroadcastMask[i + j] = j;
11042       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11043                                   BroadcastMask);
11044     }
11045   }
11046
11047   // Bail if the shuffle mask doesn't cross 128-bit lanes.
11048   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
11049     return SDValue();
11050
11051   // Bail if we already have a repeated lane shuffle mask.
11052   SmallVector<int, 8> RepeatedShuffleMask;
11053   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11054     return SDValue();
11055
11056   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11057   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11058   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11059   int NumSubLanes = NumLanes * SubLaneScale;
11060   int NumSubLaneElts = NumLaneElts / SubLaneScale;
11061
11062   // Check that all the sources are coming from the same lane and see if we can
11063   // form a repeating shuffle mask (local to each sub-lane). At the same time,
11064   // determine the source sub-lane for each destination sub-lane.
11065   int TopSrcSubLane = -1;
11066   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11067   SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
11068       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
11069       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
11070
11071   for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
11072     // Extract the sub-lane mask, check that it all comes from the same lane
11073     // and normalize the mask entries to come from the first lane.
11074     int SrcLane = -1;
11075     SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
11076     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11077       int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
11078       if (M < 0)
11079         continue;
11080       int Lane = (M % NumElts) / NumLaneElts;
11081       if ((0 <= SrcLane) && (SrcLane != Lane))
11082         return SDValue();
11083       SrcLane = Lane;
11084       int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
11085       SubLaneMask[Elt] = LocalM;
11086     }
11087
11088     // Whole sub-lane is UNDEF.
11089     if (SrcLane < 0)
11090       continue;
11091
11092     // Attempt to match against the candidate repeated sub-lane masks.
11093     for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
11094       auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
11095         for (int i = 0; i != NumSubLaneElts; ++i) {
11096           if (M1[i] < 0 || M2[i] < 0)
11097             continue;
11098           if (M1[i] != M2[i])
11099             return false;
11100         }
11101         return true;
11102       };
11103
11104       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
11105       if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
11106         continue;
11107
11108       // Merge the sub-lane mask into the matching repeated sub-lane mask.
11109       for (int i = 0; i != NumSubLaneElts; ++i) {
11110         int M = SubLaneMask[i];
11111         if (M < 0)
11112           continue;
11113         assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
11114                "Unexpected mask element");
11115         RepeatedSubLaneMask[i] = M;
11116       }
11117
11118       // Track the top most source sub-lane - by setting the remaining to UNDEF
11119       // we can greatly simplify shuffle matching.
11120       int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
11121       TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
11122       Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
11123       break;
11124     }
11125
11126     // Bail if we failed to find a matching repeated sub-lane mask.
11127     if (Dst2SrcSubLanes[DstSubLane] < 0)
11128       return SDValue();
11129   }
11130   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
11131          "Unexpected source lane");
11132
11133   // Create a repeating shuffle mask for the entire vector.
11134   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
11135   for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
11136     int Lane = SubLane / SubLaneScale;
11137     auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
11138     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11139       int M = RepeatedSubLaneMask[Elt];
11140       if (M < 0)
11141         continue;
11142       int Idx = (SubLane * NumSubLaneElts) + Elt;
11143       RepeatedMask[Idx] = M + (Lane * NumLaneElts);
11144     }
11145   }
11146   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
11147
11148   // Shuffle each source sub-lane to its destination.
11149   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
11150   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
11151     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11152     if (SrcSubLane < 0)
11153       continue;
11154     for (int j = 0; j != NumSubLaneElts; ++j)
11155       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
11156   }
11157
11158   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
11159                               SubLaneMask);
11160 }
11161
11162 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
11163                                             ArrayRef<int> Mask, SDValue V1,
11164                                             SDValue V2, SelectionDAG &DAG) {
        // Try to express a shuffle of 64-bit elements as one X86ISD::SHUFP node.
        //
        // Result position i has to read from the element pair that starts at the
        // even index (i & 6): even positions take it from the first operand, odd
        // positions from the second operand (mask indices offset by NumElts). When
        // the mask only fits with the two operands exchanged, the node is built as
        // SHUFP(V2, V1) instead. An empty SDValue is returned when neither
        // arrangement matches. VT is asserted to have 64-bit scalar elements.
11165
11166   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
11167   // Mask for V4F64: 0/1,  4/5,  2/3,  6/7..
11168   assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
11169   int NumElts = VT.getVectorNumElements();
        // Both candidates start out viable and are only ever cleared by the loop.
11170   bool ShufpdMask = true;
11171   bool CommutableMask = true;
11172   unsigned Immediate = 0;
11173   for (int i = 0; i < NumElts; ++i) {
          // Negative entries add no constraint and leave immediate bit i at 0.
11174     if (Mask[i] < 0)
11175       continue;
          // Val is the first index of the pair accepted at position i for the
          // operand order (V1, V2); CommutVal is the same for (V2, V1). Note that
          // (i & 0xe) and (i & 6) are equal for every i below 8, which covers the
          // V4F64 and V8F64 layouts listed above.
11176     int Val = (i & 6) + NumElts * (i & 1);
11177     int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
11178     if (Mask[i] < Val ||  Mask[i] > Val + 1)
11179       ShufpdMask = false;
11180     if (Mask[i] < CommutVal ||  Mask[i] > CommutVal + 1)
11181       CommutableMask = false;
          // Bit i of the immediate is the parity of the mask index, i.e. whether
          // the even (0) or odd (1) element of the pair is selected. The value is
          // the same for both operand orders.
11182     Immediate |= (Mask[i] % 2) << i;
11183   }
        // The direct operand order is preferred when both candidates survived.
11184   if (ShufpdMask)
11185     return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11186                        DAG.getConstant(Immediate, DL, MVT::i8));
11187   if (CommutableMask)
11188     return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11189                        DAG.getConstant(Immediate, DL, MVT::i8));
11190   return SDValue();
11191 }
11192
11193 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
11194 ///
11195 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
11196 /// isn't available.
      ///
      /// The strategies below are attempted in the order written and the first one
      /// that produces a node is returned, so the statement order is significant.
      ///
      /// \param DL        Location passed to every node and helper call made here.
      /// \param Mask      Shuffle mask; asserted to contain exactly 4 entries.
      /// \param V1        First shuffle operand; asserted to be MVT::v4f64.
      /// \param V2        Second shuffle operand; asserted to be MVT::v4f64.
      /// \param Subtarget Queried for AVX2 and forwarded to the helper lowerings.
      /// \param DAG       SelectionDAG in which the replacement nodes are created.
      /// \returns the value produced by the first strategy that applies.
11197 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11198                                        SDValue V1, SDValue V2,
11199                                        const X86Subtarget &Subtarget,
11200                                        SelectionDAG &DAG) {
11201   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11202   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11203   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11204
        // WidenedMask is only the output slot of the widening check; the original
        // Mask is what gets handed to lowerV2X128VectorShuffle.
11205   SmallVector<int, 4> WidenedMask;
11206   if (canWidenShuffleElements(Mask, WidenedMask))
11207     if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
11208                                              Subtarget, DAG))
11209       return V;
11210
        // Single-input shuffles: every path through this block returns.
11211   if (V2.isUndef()) {
11212     // Check for being able to broadcast a single element.
11213     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11214             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11215       return Broadcast;
11216
11217     // Use low duplicate instructions for masks that match their pattern.
11218     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11219       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
11220
11221     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
11222       // Non-half-crossing single input shuffles can be lowered with an
11223       // interleaved permutation.
            // Immediate bit i is set when result element i takes the odd element
            // of its own lane (index 1 for elements 0-1, index 3 for elements
            // 2-3); every other mask value, undef included, gives a 0 bit.
11224       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11225                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
11226       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
11227                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11228     }
11229
11230     // With AVX2 we have direct support for this permutation.
11231     if (Subtarget.hasAVX2())
11232       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
11233                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11234
11235     // Try to create an in-lane repeating shuffle mask and then shuffle the
11236     // results into the target lanes.
11237     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11238             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11239       return V;
11240
11241     // Otherwise, fall back.
11242     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
11243                                                    DAG);
11244   }
11245
        // Two-input shuffles from here on.
11246   // Use dedicated unpack instructions for masks that match their pattern.
11247   if (SDValue V =
11248           lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
11249     return V;
11250
11251   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
11252                                                 Subtarget, DAG))
11253     return Blend;
11254
11255   // Check if the blend happens to exactly fit that of SHUFPD.
11256   if (SDValue Op =
11257       lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
11258     return Op;
11259
11260   // Try to create an in-lane repeating shuffle mask and then shuffle the
11261   // results into the target lanes.
11262   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11263           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11264   return V;
11265
11266   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11267   // shuffle. However, if we have AVX2 and either input is already in place,
11268   // we will be able to shuffle even across lanes the other input in a single
11269   // instruction so skip this pattern.
11270   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11271                                 isShuffleMaskInputInPlace(1, Mask))))
11272     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11273             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11274       return Result;
11275
11276   // If we have AVX2 then we always want to lower with a blend because at v4 we
11277   // can fully permute the elements.
11278   if (Subtarget.hasAVX2())
11279     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
11280                                                       Mask, DAG);
11281
11282   // Otherwise fall back on generic lowering.
11283   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
11284 }
11285
11286 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
11287 ///
11288 /// This routine is only called when we have AVX2 and thus a reasonable
11289 /// instruction set for v4i64 shuffling.
      ///
      /// The strategies below are attempted in the order written and the first one
      /// that produces a node is returned.
      ///
      /// \param DL        Location passed to every node and helper call made here.
      /// \param Mask      Shuffle mask; asserted to contain exactly 4 entries.
      /// \param V1        First shuffle operand; asserted to be MVT::v4i64.
      /// \param V2        Second shuffle operand; asserted to be MVT::v4i64.
      /// \param Subtarget Asserted to report AVX2; forwarded to the helpers.
      /// \param DAG       SelectionDAG in which the replacement nodes are created.
      /// \returns the value produced by the first strategy that applies.
11290 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11291                                        SDValue V1, SDValue V2,
11292                                        const X86Subtarget &Subtarget,
11293                                        SelectionDAG &DAG) {
11294   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11295   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11296   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11297   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
11298
        // WidenedMask is only the output slot of the widening check; the original
        // Mask is what gets handed to lowerV2X128VectorShuffle.
11299   SmallVector<int, 4> WidenedMask;
11300   if (canWidenShuffleElements(Mask, WidenedMask))
11301     if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
11302                                              Subtarget, DAG))
11303       return V;
11304
11305   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
11306                                                 Subtarget, DAG))
11307     return Blend;
11308
11309   // Check for being able to broadcast a single element.
11310   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
11311                                                         Mask, Subtarget, DAG))
11312     return Broadcast;
11313
        // Single-input shuffles: both paths through this block return.
11314   if (V2.isUndef()) {
11315     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
11316     // can use lower latency instructions that will operate on both lanes.
11317     SmallVector<int, 2> RepeatedMask;
11318     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
            // The per-lane mask is rescaled by a factor of 2 into PSHUFDMask and
            // the shuffle is carried out as a PSHUFD on the v8i32 bitcast of V1,
            // then cast back to v4i64.
11319       SmallVector<int, 4> PSHUFDMask;
11320       scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
11321       return DAG.getBitcast(
11322           MVT::v4i64,
11323           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
11324                       DAG.getBitcast(MVT::v8i32, V1),
11325                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11326     }
11327
11328     // AVX2 provides a direct instruction for permuting a single input across
11329     // lanes.
11330     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
11331                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11332   }
11333
11334   // Try to use shift instructions.
11335   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
11336                                                 Subtarget, DAG))
11337     return Shift;
11338
11339   // Use dedicated unpack instructions for masks that match their pattern.
11340   if (SDValue V =
11341           lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
11342     return V;
11343
11344   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11345   // shuffle. However, if we have AVX2 and either input is already in place,
11346   // we will be able to shuffle even across lanes the other input in a single
11347   // instruction so skip this pattern.
        // NOTE(review): AVX2 is asserted at the top of this function, so the
        // hasAVX2() term is expected to be true whenever this point is reached and
        // the condition reduces to the two in-place checks.
11348   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11349                                  isShuffleMaskInputInPlace(1, Mask))))
11350     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11351             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
11352       return Result;
11353
11354   // Otherwise fall back on generic blend lowering.
11355   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
11356                                                     Mask, DAG);
11357 }
11358
11359 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
11360 ///
11361 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
11362 /// isn't available.
      ///
      /// The strategies below are attempted in the order written and the first one
      /// that produces a node is returned.
      ///
      /// \param DL        Location passed to every node and helper call made here.
      /// \param Mask      Shuffle mask; asserted to contain exactly 8 entries.
      /// \param V1        First shuffle operand; asserted to be MVT::v8f32.
      /// \param V2        Second shuffle operand; asserted to be MVT::v8f32.
      /// \param Subtarget Queried for AVX2 and forwarded to the helper lowerings.
      /// \param DAG       SelectionDAG in which the replacement nodes are created.
      /// \returns the value produced by the first strategy that applies.
11363 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11364                                        SDValue V1, SDValue V2,
11365                                        const X86Subtarget &Subtarget,
11366                                        SelectionDAG &DAG) {
11367   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11368   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11369   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11370
11371   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
11372                                                 Subtarget, DAG))
11373     return Blend;
11374
11375   // Check for being able to broadcast a single element.
11376   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
11377                                                         Mask, Subtarget, DAG))
11378     return Broadcast;
11379
11380   // If the shuffle mask is repeated in each 128-bit lane, we have many more
11381   // options to efficiently lower the shuffle.
        // Every path inside this block returns.
11382   SmallVector<int, 4> RepeatedMask;
11383   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
11384     assert(RepeatedMask.size() == 4 &&
11385            "Repeated masks must be half the mask width!");
11386
11387     // Use even/odd duplicate instructions for masks that match their pattern.
11388     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11389       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
11390     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11391       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
11392
11393     if (V2.isUndef())
11394       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
11395                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11396
11397     // Use dedicated unpack instructions for masks that match their pattern.
          // Note that the full 8-entry Mask is passed here, not RepeatedMask.
11398     if (SDValue V =
11399             lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
11400       return V;
11401
11402     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
11403     // have already handled any direct blends.
11404     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
11405   }
11406
11407   // Try to create an in-lane repeating shuffle mask and then shuffle the
11408   // results into the target lanes.
11409   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11410           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11411     return V;
11412
11413   // If we have a single input shuffle with different shuffle patterns in the
11414   // two 128-bit lanes use the variable mask to VPERMILPS.
11415   if (V2.isUndef()) {
          // Build the i32 index vector once (undef for negative mask entries); it
          // is shared by the in-lane VPERMILPV form and the AVX2 VPERMV form. The
          // two nodes take their operands in opposite order: VPERMILPV is given
          // (V1, indices) while VPERMV is given (indices, V1).
11416     SDValue VPermMask[8];
11417     for (int i = 0; i < 8; ++i)
11418       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11419                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
11420     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
11421       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
11422                          DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
11423
11424     if (Subtarget.hasAVX2())
11425       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
11426                          DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11427
11428     // Otherwise, fall back.
11429     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
11430                                                    DAG);
11431   }
11432
11433   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11434   // shuffle.
11435   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11436           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11437     return Result;
11438
11439   // If we have AVX2 then we always want to lower with a blend because at v8 we
11440   // can fully permute the elements.
11441   if (Subtarget.hasAVX2())
11442     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
11443                                                       Mask, DAG);
11444
11445   // Otherwise fall back on generic lowering.
11446   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
11447 }
11448
11449 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
11450 ///
11451 /// This routine is only called when we have AVX2 and thus a reasonable
11452 /// instruction set for v8i32 shuffling.
      ///
      /// The strategies below are attempted in the order written and the first one
      /// that produces a node is returned.
      ///
      /// \param DL        Location passed to every node and helper call made here.
      /// \param Mask      Shuffle mask; asserted to contain exactly 8 entries.
      /// \param V1        First shuffle operand; asserted to be MVT::v8i32.
      /// \param V2        Second shuffle operand; asserted to be MVT::v8i32.
      /// \param Subtarget Asserted to report AVX2; forwarded to the helpers.
      /// \param DAG       SelectionDAG in which the replacement nodes are created.
      /// \returns the value produced by the first strategy that applies.
11453 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11454                                        SDValue V1, SDValue V2,
11455                                        const X86Subtarget &Subtarget,
11456                                        SelectionDAG &DAG) {
11457   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11458   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11459   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11460   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
11461
11462   // Whenever we can lower this as a zext, that instruction is strictly faster
11463   // than any alternative. It also allows us to fold memory operands into the
11464   // shuffle in many cases.
11465   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
11466                                                          Mask, Subtarget, DAG))
11467     return ZExt;
11468
11469   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
11470                                                 Subtarget, DAG))
11471     return Blend;
11472
11473   // Check for being able to broadcast a single element.
11474   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
11475                                                         Mask, Subtarget, DAG))
11476     return Broadcast;
11477
11478   // If the shuffle mask is repeated in each 128-bit lane we can use more
11479   // efficient instructions that mirror the shuffles across the two 128-bit
11480   // lanes.
        // Unlike the single-input PSHUFD path, the two-input case may fall through
        // to the strategies after this block when the unpack match fails.
11481   SmallVector<int, 4> RepeatedMask;
11482   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
11483     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11484     if (V2.isUndef())
11485       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
11486                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11487
11488     // Use dedicated unpack instructions for masks that match their pattern.
11489     if (SDValue V =
11490             lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
11491       return V;
11492   }
11493
11494   // Try to use shift instructions.
11495   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
11496                                                 Subtarget, DAG))
11497     return Shift;
11498
11499   // Try to use byte rotation instructions.
11500   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11501           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11502     return Rotate;
11503
11504   // Try to create an in-lane repeating shuffle mask and then shuffle the
11505   // results into the target lanes.
11506   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11507           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11508     return V;
11509
11510   // If the shuffle patterns aren't repeated but it is a single input, directly
11511   // generate a cross-lane VPERMD instruction.
11512   if (V2.isUndef()) {
          // One i32 index per result element, undef for negative mask entries.
          // The index vector is the first operand of the VPERMV node, V1 the second.
11513     SDValue VPermMask[8];
11514     for (int i = 0; i < 8; ++i)
11515       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11516                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
11517     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
11518                        DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11519   }
11520
11521   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11522   // shuffle.
11523   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11524           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11525     return Result;
11526
11527   // Otherwise fall back on generic blend lowering.
11528   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
11529                                                     Mask, DAG);
11530 }
11531
11532 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
11533 ///
11534 /// This routine is only called when we have AVX2 and thus a reasonable
11535 /// instruction set for v16i16 shuffling.
      ///
      /// The strategies below are attempted in the order written and the first one
      /// that produces a node is returned.
      ///
      /// \param DL        Location passed to every node and helper call made here.
      /// \param Mask      Shuffle mask; asserted to contain exactly 16 entries.
      /// \param V1        First shuffle operand; asserted to be MVT::v16i16.
      /// \param V2        Second shuffle operand; asserted to be MVT::v16i16.
      /// \param Subtarget Asserted to report AVX2; forwarded to the helpers.
      /// \param DAG       SelectionDAG in which the replacement nodes are created.
      /// \returns the value produced by the first strategy that applies.
11536 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11537                                         SDValue V1, SDValue V2,
11538                                         const X86Subtarget &Subtarget,
11539                                         SelectionDAG &DAG) {
11540   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11541   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11542   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11543   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
11544
11545   // Whenever we can lower this as a zext, that instruction is strictly faster
11546   // than any alternative. It also allows us to fold memory operands into the
11547   // shuffle in many cases.
11548   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
11549                                                          Mask, Subtarget, DAG))
11550     return ZExt;
11551
11552   // Check for being able to broadcast a single element.
11553   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
11554                                                         Mask, Subtarget, DAG))
11555     return Broadcast;
11556
11557   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
11558                                                 Subtarget, DAG))
11559     return Blend;
11560
11561   // Use dedicated unpack instructions for masks that match their pattern.
11562   if (SDValue V =
11563           lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
11564     return V;
11565
11566   // Try to use shift instructions.
11567   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
11568                                                 Subtarget, DAG))
11569     return Shift;
11570
11571   // Try to use byte rotation instructions.
11572   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11573           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11574     return Rotate;
11575
11576   // Try to create an in-lane repeating shuffle mask and then shuffle the
11577   // results into the target lanes.
11578   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11579           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11580     return V;
11581
        // Single-input shuffles. A mask that is neither lane-crossing nor repeated
        // across the 128-bit lanes leaves this block without returning and is
        // handled by the strategies that follow it.
11582   if (V2.isUndef()) {
11583     // There are no generalized cross-lane shuffle operations available on i16
11584     // element types.
11585     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
11586       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
11587                                                      Mask, DAG);
11588
11589     SmallVector<int, 8> RepeatedMask;
11590     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11591       // As this is a single-input shuffle, the repeated mask should be
11592       // a strictly valid v8i16 mask that we can pass through to the v8i16
11593       // lowering to handle even the v16 case.
11594       return lowerV8I16GeneralSingleInputVectorShuffle(
11595           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
11596     }
11597   }
11598
11599   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
11600                                                     V2, Subtarget, DAG))
11601     return PSHUFB;
11602
11603   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11604   // shuffle.
11605   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11606           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11607     return Result;
11608
11609   // Otherwise fall back on generic lowering.
11610   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
11611 }
11612
11613 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
11614 ///
11615 /// This routine is only called when we have AVX2 and thus a reasonable
11616 /// instruction set for v32i8 shuffling.
      ///
      /// The strategies below are attempted in the order written and the first one
      /// that produces a node is returned.
      ///
      /// \param DL        Location passed to every node and helper call made here.
      /// \param Mask      Shuffle mask; asserted to contain exactly 32 entries.
      /// \param V1        First shuffle operand; asserted to be MVT::v32i8.
      /// \param V2        Second shuffle operand; asserted to be MVT::v32i8.
      /// \param Subtarget Asserted to report AVX2; forwarded to the helpers.
      /// \param DAG       SelectionDAG in which the replacement nodes are created.
      /// \returns the value produced by the first strategy that applies.
11617 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11618                                        SDValue V1, SDValue V2,
11619                                        const X86Subtarget &Subtarget,
11620                                        SelectionDAG &DAG) {
11621   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11622   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11623   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11624   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
11625
11626   // Whenever we can lower this as a zext, that instruction is strictly faster
11627   // than any alternative. It also allows us to fold memory operands into the
11628   // shuffle in many cases.
11629   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
11630                                                          Mask, Subtarget, DAG))
11631     return ZExt;
11632
11633   // Check for being able to broadcast a single element.
11634   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
11635                                                         Mask, Subtarget, DAG))
11636     return Broadcast;
11637
11638   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
11639                                                 Subtarget, DAG))
11640     return Blend;
11641
11642   // Use dedicated unpack instructions for masks that match their pattern.
11643   if (SDValue V =
11644           lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
11645     return V;
11646
11647   // Try to use shift instructions.
11648   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
11649                                                 Subtarget, DAG))
11650     return Shift;
11651
11652   // Try to use byte rotation instructions.
11653   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11654           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11655     return Rotate;
11656
11657   // Try to create an in-lane repeating shuffle mask and then shuffle the
11658   // results into the target lanes.
11659   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11660           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11661     return V;
11662
11663   // There are no generalized cross-lane shuffle operations available on i8
11664   // element types.
        // Only single-input, lane-crossing masks take this path; everything else
        // continues with the PSHUFB attempt below.
11665   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
11666     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
11667                                                    DAG);
11668
11669   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
11670                                                     V2, Subtarget, DAG))
11671     return PSHUFB;
11672
11673   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11674   // shuffle.
11675   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11676           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11677     return Result;
11678
11679   // Otherwise fall back on generic lowering.
11680   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
11681 }
11682
11683 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
11684 ///
11685 /// This routine either breaks down the specific type of a 256-bit x86 vector
11686 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
11687 /// together based on the available instructions.
      ///
      /// \param DL        Location passed to every node and helper call made here.
      /// \param Mask      Shuffle mask; entries >= the element count of VT are
      ///                  counted as elements taken from V2.
      /// \param VT        Vector type of the shuffle; must be one of the six types
      ///                  handled by the switch at the end of the function.
      /// \param V1        First shuffle operand.
      /// \param V2        Second shuffle operand.
      /// \param Subtarget Queried for AVX2 and forwarded to the helper lowerings.
      /// \param DAG       SelectionDAG in which the replacement nodes are created.
      /// \returns the value produced by the first strategy that applies.
11688 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11689                                         MVT VT, SDValue V1, SDValue V2,
11690                                         const X86Subtarget &Subtarget,
11691                                         SelectionDAG &DAG) {
11692   // If we have a single input to the zero element, insert that into V1 if we
11693   // can do so cheaply.
11694   int NumElts = VT.getVectorNumElements();
11695   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
11696
        // Only attempted when the single V2 element is the one at result index 0.
11697   if (NumV2Elements == 1 && Mask[0] >= NumElts)
11698     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11699                               DL, VT, V1, V2, Mask, Subtarget, DAG))
11700       return Insertion;
11701
11702   // Handle special cases where the lower or upper half is UNDEF.
11703   if (SDValue V =
11704           lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
11705     return V;
11706
11707   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
11708   // can check for those subtargets here and avoid much of the subtarget
11709   // querying in the per-vector-type lowering routines. With AVX1 we have
11710   // essentially *zero* ability to manipulate a 256-bit vector with integer
11711   // types. Since we'll use floating point types there eventually, just
11712   // immediately cast everything to a float and operate entirely in that domain.
        // Every path inside this block returns, so the integer cases of the switch
        // below are only reached when AVX2 is available.
11713   if (VT.isInteger() && !Subtarget.hasAVX2()) {
11714     int ElementBits = VT.getScalarSizeInBits();
11715     if (ElementBits < 32) {
11716       // No floating point type available, if we can't use the bit operations
11717       // for masking/blending then decompose into 128-bit vectors.
11718       if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
11719         return V;
11720       if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11721         return V;
11722       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11723     }
11724
          // Same element width and element count, but a floating point element
          // type: bitcast both operands, build a new generic shuffle with the
          // unchanged mask on that type, and bitcast the result back to VT.
11725     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
11726                                 VT.getVectorNumElements());
11727     V1 = DAG.getBitcast(FpVT, V1);
11728     V2 = DAG.getBitcast(FpVT, V2);
11729     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
11730   }
11731
        // Dispatch to the per-type routine. VT itself is not forwarded; each
        // routine asserts the operand type it expects.
11732   switch (VT.SimpleTy) {
11733   case MVT::v4f64:
11734     return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11735   case MVT::v4i64:
11736     return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11737   case MVT::v8f32:
11738     return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11739   case MVT::v8i32:
11740     return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11741   case MVT::v16i16:
11742     return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11743   case MVT::v32i8:
11744     return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11745
11746   default:
11747     llvm_unreachable("Not a valid 256-bit x86 vector type!");
11748   }
11749 }
11750
11751 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
11752 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
11753                                         ArrayRef<int> Mask, SDValue V1,
11754                                         SDValue V2, SelectionDAG &DAG) {
11755   assert(VT.getScalarSizeInBits() == 64 &&
11756          "Unexpected element type size for 128bit shuffle.");
11757
11758   // To handle 256 bit vector requires VLX and most probably
11759   // function lowerV2X128VectorShuffle() is better solution.
11760   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
11761
11762   SmallVector<int, 4> WidenedMask;
11763   if (!canWidenShuffleElements(Mask, WidenedMask))
11764     return SDValue();
11765
11766   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11767   // Insure elements came from the same Op.
11768   int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
11769   for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11770     if (WidenedMask[i] == SM_SentinelZero)
11771       return SDValue();
11772     if (WidenedMask[i] == SM_SentinelUndef)
11773       continue;
11774
11775     SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
11776     unsigned OpIndex = (i < Size/2) ? 0 : 1;
11777     if (Ops[OpIndex].isUndef())
11778       Ops[OpIndex] = Op;
11779     else if (Ops[OpIndex] != Op)
11780       return SDValue();
11781   }
11782
11783   // Form a 128-bit permutation.
11784   // Convert the 64-bit shuffle mask selection values into 128-bit selection
11785   // bits defined by a vshuf64x2 instruction's immediate control byte.
11786   unsigned PermMask = 0, Imm = 0;
11787   unsigned ControlBitsNum = WidenedMask.size() / 2;
11788
11789   for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11790     // Use first element in place of undef mask.
11791     Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
11792     PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
11793   }
11794
11795   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
11796                      DAG.getConstant(PermMask, DL, MVT::i8));
11797 }
11798
11799 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
11800                                            ArrayRef<int> Mask, SDValue V1,
11801                                            SDValue V2, SelectionDAG &DAG) {
11802
11803   assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
11804
11805   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
11806   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
11807
11808   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
11809   if (V2.isUndef())
11810     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
11811
11812   return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
11813 }
11814
11815 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
11816 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11817                                        SDValue V1, SDValue V2,
11818                                        const X86Subtarget &Subtarget,
11819                                        SelectionDAG &DAG) {
11820   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11821   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11822   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11823
11824   if (V2.isUndef()) {
11825     // Use low duplicate instructions for masks that match their pattern.
11826     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
11827       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
11828
11829     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
11830       // Non-half-crossing single input shuffles can be lowered with an
11831       // interleaved permutation.
11832       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11833                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
11834                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
11835                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
11836       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
11837                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11838     }
11839
11840     SmallVector<int, 4> RepeatedMask;
11841     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
11842       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
11843                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11844   }
11845
11846   if (SDValue Shuf128 =
11847           lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
11848     return Shuf128;
11849
11850   if (SDValue Unpck =
11851           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
11852     return Unpck;
11853
11854   // Check if the blend happens to exactly fit that of SHUFPD.
11855   if (SDValue Op =
11856       lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
11857     return Op;
11858
11859   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
11860 }
11861
11862 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
11863 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
11864                                         SDValue V1, SDValue V2,
11865                                         const X86Subtarget &Subtarget,
11866                                         SelectionDAG &DAG) {
11867   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11868   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11869   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11870
11871   // If the shuffle mask is repeated in each 128-bit lane, we have many more
11872   // options to efficiently lower the shuffle.
11873   SmallVector<int, 4> RepeatedMask;
11874   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
11875     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11876
11877     // Use even/odd duplicate instructions for masks that match their pattern.
11878     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11879       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
11880     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11881       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
11882
11883     if (V2.isUndef())
11884       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
11885                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11886
11887     // Use dedicated unpack instructions for masks that match their pattern.
11888     if (SDValue Unpck =
11889             lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
11890       return Unpck;
11891
11892     // Otherwise, fall back to a SHUFPS sequence.
11893     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
11894   }
11895
11896   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
11897 }
11898
11899 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11900 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11901                                        SDValue V1, SDValue V2,
11902                                        const X86Subtarget &Subtarget,
11903                                        SelectionDAG &DAG) {
11904   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11905   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11906   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11907
11908   if (SDValue Shuf128 =
11909           lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
11910     return Shuf128;
11911
11912   if (V2.isUndef()) {
11913     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
11914     // can use lower latency instructions that will operate on all four
11915     // 128-bit lanes.
11916     SmallVector<int, 2> Repeated128Mask;
11917     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
11918       SmallVector<int, 4> PSHUFDMask;
11919       scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
11920       return DAG.getBitcast(
11921           MVT::v8i64,
11922           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
11923                       DAG.getBitcast(MVT::v16i32, V1),
11924                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11925     }
11926
11927     SmallVector<int, 4> Repeated256Mask;
11928     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
11929       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
11930                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
11931   }
11932
11933   // Try to use shift instructions.
11934   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
11935                                                 Subtarget, DAG))
11936     return Shift;
11937
11938   if (SDValue Unpck =
11939           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
11940     return Unpck;
11941
11942   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
11943 }
11944
11945 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11946 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11947                                         SDValue V1, SDValue V2,
11948                                         const X86Subtarget &Subtarget,
11949                                         SelectionDAG &DAG) {
11950   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11951   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11952   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11953
11954   // If the shuffle mask is repeated in each 128-bit lane we can use more
11955   // efficient instructions that mirror the shuffles across the four 128-bit
11956   // lanes.
11957   SmallVector<int, 4> RepeatedMask;
11958   if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
11959     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11960     if (V2.isUndef())
11961       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
11962                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11963
11964     // Use dedicated unpack instructions for masks that match their pattern.
11965     if (SDValue V =
11966             lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
11967       return V;
11968   }
11969
11970   // Try to use shift instructions.
11971   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
11972                                                 Subtarget, DAG))
11973     return Shift;
11974
11975   // Try to use byte rotation instructions.
11976   if (Subtarget.hasBWI())
11977     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11978             DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
11979       return Rotate;
11980
11981   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
11982 }
11983
11984 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11985 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11986                                         SDValue V1, SDValue V2,
11987                                         const X86Subtarget &Subtarget,
11988                                         SelectionDAG &DAG) {
11989   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11990   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11991   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11992   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11993
11994   // Use dedicated unpack instructions for masks that match their pattern.
11995   if (SDValue V =
11996           lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
11997     return V;
11998
11999   // Try to use shift instructions.
12000   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
12001                                                 Subtarget, DAG))
12002     return Shift;
12003
12004   // Try to use byte rotation instructions.
12005   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12006           DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
12007     return Rotate;
12008
12009   if (V2.isUndef()) {
12010     SmallVector<int, 8> RepeatedMask;
12011     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
12012       // As this is a single-input shuffle, the repeated mask should be
12013       // a strictly valid v8i16 mask that we can pass through to the v8i16
12014       // lowering to handle even the v32 case.
12015       return lowerV8I16GeneralSingleInputVectorShuffle(
12016           DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
12017     }
12018   }
12019
12020   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
12021 }
12022
12023 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
12024 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12025                                        SDValue V1, SDValue V2,
12026                                        const X86Subtarget &Subtarget,
12027                                        SelectionDAG &DAG) {
12028   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
12029   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
12030   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
12031   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
12032
12033   // Use dedicated unpack instructions for masks that match their pattern.
12034   if (SDValue V =
12035           lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
12036     return V;
12037
12038   // Try to use shift instructions.
12039   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
12040                                                 Subtarget, DAG))
12041     return Shift;
12042
12043   // Try to use byte rotation instructions.
12044   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12045           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
12046     return Rotate;
12047
12048   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
12049                                                     V2, Subtarget, DAG))
12050     return PSHUFB;
12051
12052   // FIXME: Implement direct support for this type!
12053   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
12054 }
12055
12056 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
12057 ///
12058 /// This routine either breaks down the specific type of a 512-bit x86 vector
12059 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
12060 /// together based on the available instructions.
12061 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12062                                         MVT VT, SDValue V1, SDValue V2,
12063                                         const X86Subtarget &Subtarget,
12064                                         SelectionDAG &DAG) {
12065   assert(Subtarget.hasAVX512() &&
12066          "Cannot lower 512-bit vectors w/ basic ISA!");
12067
12068   // Check for being able to broadcast a single element.
12069   if (SDValue Broadcast =
12070           lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
12071     return Broadcast;
12072
12073   // Dispatch to each element type for lowering. If we don't have support for
12074   // specific element type shuffles at 512 bits, immediately split them and
12075   // lower them. Each lowering routine of a given type is allowed to assume that
12076   // the requisite ISA extensions for that element type are available.
12077   switch (VT.SimpleTy) {
12078   case MVT::v8f64:
12079     return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12080   case MVT::v16f32:
12081     return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12082   case MVT::v8i64:
12083     return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12084   case MVT::v16i32:
12085     return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12086   case MVT::v32i16:
12087     return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12088   case MVT::v64i8:
12089     return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12090
12091   default:
12092     llvm_unreachable("Not a valid 512-bit x86 vector type!");
12093   }
12094 }
12095
12096 // Lower vXi1 vector shuffles.
12097 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
12098 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
12099 // vector, shuffle and then truncate it back.
12100 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12101                                       MVT VT, SDValue V1, SDValue V2,
12102                                       const X86Subtarget &Subtarget,
12103                                       SelectionDAG &DAG) {
12104   assert(Subtarget.hasAVX512() &&
12105          "Cannot lower 512-bit vectors w/o basic ISA!");
12106   MVT ExtVT;
12107   switch (VT.SimpleTy) {
12108   default:
12109     llvm_unreachable("Expected a vector of i1 elements");
12110   case MVT::v2i1:
12111     ExtVT = MVT::v2i64;
12112     break;
12113   case MVT::v4i1:
12114     ExtVT = MVT::v4i32;
12115     break;
12116   case MVT::v8i1:
12117     ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
12118     break;
12119   case MVT::v16i1:
12120     ExtVT = MVT::v16i32;
12121     break;
12122   case MVT::v32i1:
12123     ExtVT = MVT::v32i16;
12124     break;
12125   case MVT::v64i1:
12126     ExtVT = MVT::v64i8;
12127     break;
12128   }
12129
12130   if (ISD::isBuildVectorAllZeros(V1.getNode()))
12131     V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12132   else if (ISD::isBuildVectorAllOnes(V1.getNode()))
12133     V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12134   else
12135     V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
12136
12137   if (V2.isUndef())
12138     V2 = DAG.getUNDEF(ExtVT);
12139   else if (ISD::isBuildVectorAllZeros(V2.getNode()))
12140     V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12141   else if (ISD::isBuildVectorAllOnes(V2.getNode()))
12142     V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12143   else
12144     V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
12145   return DAG.getNode(ISD::TRUNCATE, DL, VT,
12146                      DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
12147 }
12148 /// \brief Top-level lowering for x86 vector shuffles.
12149 ///
12150 /// This handles decomposition, canonicalization, and lowering of all x86
12151 /// vector shuffles. Most of the specific lowering strategies are encapsulated
12152 /// above in helper routines. The canonicalization attempts to widen shuffles
12153 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
12154 /// s.t. only one of the two inputs needs to be tested, etc.
12155 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
12156                                   SelectionDAG &DAG) {
12157   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12158   ArrayRef<int> Mask = SVOp->getMask();
12159   SDValue V1 = Op.getOperand(0);
12160   SDValue V2 = Op.getOperand(1);
12161   MVT VT = Op.getSimpleValueType();
12162   int NumElements = VT.getVectorNumElements();
12163   SDLoc DL(Op);
12164   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
12165
12166   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
12167          "Can't lower MMX shuffles");
12168
12169   bool V1IsUndef = V1.isUndef();
12170   bool V2IsUndef = V2.isUndef();
12171   if (V1IsUndef && V2IsUndef)
12172     return DAG.getUNDEF(VT);
12173
12174   // When we create a shuffle node we put the UNDEF node to second operand,
12175   // but in some cases the first operand may be transformed to UNDEF.
12176   // In this case we should just commute the node.
12177   if (V1IsUndef)
12178     return DAG.getCommutedVectorShuffle(*SVOp);
12179
12180   // Check for non-undef masks pointing at an undef vector and make the masks
12181   // undef as well. This makes it easier to match the shuffle based solely on
12182   // the mask.
12183   if (V2IsUndef)
12184     for (int M : Mask)
12185       if (M >= NumElements) {
12186         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
12187         for (int &M : NewMask)
12188           if (M >= NumElements)
12189             M = -1;
12190         return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
12191       }
12192
12193   // We actually see shuffles that are entirely re-arrangements of a set of
12194   // zero inputs. This mostly happens while decomposing complex shuffles into
12195   // simple ones. Directly lower these as a buildvector of zeros.
12196   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
12197   if (Zeroable.all())
12198     return getZeroVector(VT, Subtarget, DAG, DL);
12199
12200   // Try to collapse shuffles into using a vector type with fewer elements but
12201   // wider element types. We cap this to not form integers or floating point
12202   // elements wider than 64 bits, but it might be interesting to form i128
12203   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
12204   SmallVector<int, 16> WidenedMask;
12205   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
12206       canWidenShuffleElements(Mask, WidenedMask)) {
12207     MVT NewEltVT = VT.isFloatingPoint()
12208                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
12209                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
12210     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12211     // Make sure that the new vector type is legal. For example, v2f64 isn't
12212     // legal on SSE1.
12213     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12214       V1 = DAG.getBitcast(NewVT, V1);
12215       V2 = DAG.getBitcast(NewVT, V2);
12216       return DAG.getBitcast(
12217           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
12218     }
12219   }
12220
12221   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
12222   for (int M : Mask)
12223     if (M < 0)
12224       ++NumUndefElements;
12225     else if (M < NumElements)
12226       ++NumV1Elements;
12227     else
12228       ++NumV2Elements;
12229
12230   // Commute the shuffle as needed such that more elements come from V1 than
12231   // V2. This allows us to match the shuffle pattern strictly on how many
12232   // elements come from V1 without handling the symmetric cases.
12233   if (NumV2Elements > NumV1Elements)
12234     return DAG.getCommutedVectorShuffle(*SVOp);
12235
12236   assert(NumV1Elements > 0 && "No V1 indices");
12237   assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
12238
12239   // When the number of V1 and V2 elements are the same, try to minimize the
12240   // number of uses of V2 in the low half of the vector. When that is tied,
12241   // ensure that the sum of indices for V1 is equal to or lower than the sum
12242   // indices for V2. When those are equal, try to ensure that the number of odd
12243   // indices for V1 is lower than the number of odd indices for V2.
12244   if (NumV1Elements == NumV2Elements) {
12245     int LowV1Elements = 0, LowV2Elements = 0;
12246     for (int M : Mask.slice(0, NumElements / 2))
12247       if (M >= NumElements)
12248         ++LowV2Elements;
12249       else if (M >= 0)
12250         ++LowV1Elements;
12251     if (LowV2Elements > LowV1Elements)
12252       return DAG.getCommutedVectorShuffle(*SVOp);
12253     if (LowV2Elements == LowV1Elements) {
12254       int SumV1Indices = 0, SumV2Indices = 0;
12255       for (int i = 0, Size = Mask.size(); i < Size; ++i)
12256         if (Mask[i] >= NumElements)
12257           SumV2Indices += i;
12258         else if (Mask[i] >= 0)
12259           SumV1Indices += i;
12260       if (SumV2Indices < SumV1Indices)
12261         return DAG.getCommutedVectorShuffle(*SVOp);
12262       if (SumV2Indices == SumV1Indices) {
12263         int NumV1OddIndices = 0, NumV2OddIndices = 0;
12264         for (int i = 0, Size = Mask.size(); i < Size; ++i)
12265           if (Mask[i] >= NumElements)
12266             NumV2OddIndices += i % 2;
12267           else if (Mask[i] >= 0)
12268             NumV1OddIndices += i % 2;
12269         if (NumV2OddIndices < NumV1OddIndices)
12270           return DAG.getCommutedVectorShuffle(*SVOp);
12271       }
12272     }
12273   }
12274
12275   // For each vector width, delegate to a specialized lowering routine.
12276   if (VT.is128BitVector())
12277     return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12278
12279   if (VT.is256BitVector())
12280     return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12281
12282   if (VT.is512BitVector())
12283     return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12284
12285   if (Is1BitVector)
12286     return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12287
12288   llvm_unreachable("Unimplemented!");
12289 }
12290
12291 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
12292 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
12293                                            const X86Subtarget &Subtarget,
12294                                            SelectionDAG &DAG) {
12295   SDValue Cond = Op.getOperand(0);
12296   SDValue LHS = Op.getOperand(1);
12297   SDValue RHS = Op.getOperand(2);
12298   SDLoc dl(Op);
12299   MVT VT = Op.getSimpleValueType();
12300
12301   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12302     return SDValue();
12303   auto *CondBV = cast<BuildVectorSDNode>(Cond);
12304
12305   // Only non-legal VSELECTs reach this lowering, convert those into generic
12306   // shuffles and re-use the shuffle lowering path for blends.
12307   SmallVector<int, 32> Mask;
12308   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
12309     SDValue CondElt = CondBV->getOperand(i);
12310     Mask.push_back(
12311         isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
12312                                      : -1);
12313   }
12314   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
12315 }
12316
12317 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12318   // A vselect where all conditions and data are constants can be optimized into
12319   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12320   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12321       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12322       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12323     return SDValue();
12324
12325   // Try to lower this to a blend-style vector shuffle. This can handle all
12326   // constant condition cases.
12327   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
12328     return BlendOp;
12329
12330   // Variable blends are only legal from SSE4.1 onward.
12331   if (!Subtarget.hasSSE41())
12332     return SDValue();
12333
12334   // Only some types will be legal on some subtargets. If we can emit a legal
12335   // VSELECT-matching blend, return Op, and but if we need to expand, return
12336   // a null value.
12337   switch (Op.getSimpleValueType().SimpleTy) {
12338   default:
12339     // Most of the vector types have blends past SSE4.1.
12340     return Op;
12341
12342   case MVT::v32i8:
12343     // The byte blends for AVX vectors were introduced only in AVX2.
12344     if (Subtarget.hasAVX2())
12345       return Op;
12346
12347     return SDValue();
12348
12349   case MVT::v8i16:
12350   case MVT::v16i16:
12351     // AVX-512 BWI and VLX features support VSELECT with i16 elements.
12352     if (Subtarget.hasBWI() && Subtarget.hasVLX())
12353       return Op;
12354
12355     // FIXME: We should custom lower this by fixing the condition and using i8
12356     // blends.
12357     return SDValue();
12358   }
12359 }
12360
12361 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12362   MVT VT = Op.getSimpleValueType();
12363   SDLoc dl(Op);
12364
12365   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12366     return SDValue();
12367
12368   if (VT.getSizeInBits() == 8) {
12369     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12370                                   Op.getOperand(0), Op.getOperand(1));
12371     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12372                                   DAG.getValueType(VT));
12373     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12374   }
12375
12376   if (VT.getSizeInBits() == 16) {
12377     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12378     if (isNullConstant(Op.getOperand(1)))
12379       return DAG.getNode(
12380           ISD::TRUNCATE, dl, MVT::i16,
12381           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12382                       DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12383                       Op.getOperand(1)));
12384     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12385                                   Op.getOperand(0), Op.getOperand(1));
12386     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12387                                   DAG.getValueType(VT));
12388     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12389   }
12390
12391   if (VT == MVT::f32) {
12392     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12393     // the result back to FR32 register. It's only worth matching if the
12394     // result has a single use which is a store or a bitcast to i32.  And in
12395     // the case of a store, it's not worth it if the index is a constant 0,
12396     // because a MOVSSmr can be used instead, which is smaller and faster.
12397     if (!Op.hasOneUse())
12398       return SDValue();
12399     SDNode *User = *Op.getNode()->use_begin();
12400     if ((User->getOpcode() != ISD::STORE ||
12401          isNullConstant(Op.getOperand(1))) &&
12402         (User->getOpcode() != ISD::BITCAST ||
12403          User->getValueType(0) != MVT::i32))
12404       return SDValue();
12405     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12406                                   DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12407                                   Op.getOperand(1));
12408     return DAG.getBitcast(MVT::f32, Extract);
12409   }
12410
12411   if (VT == MVT::i32 || VT == MVT::i64) {
12412     // ExtractPS/pextrq works with constant index.
12413     if (isa<ConstantSDNode>(Op.getOperand(1)))
12414       return Op;
12415   }
12416   return SDValue();
12417 }
12418
12419 /// Extract one bit from mask vector, like v16i1 or v8i1.
12420 /// AVX-512 feature.
12421 SDValue
12422 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12423   SDValue Vec = Op.getOperand(0);
12424   SDLoc dl(Vec);
12425   MVT VecVT = Vec.getSimpleValueType();
12426   SDValue Idx = Op.getOperand(1);
12427   MVT EltVT = Op.getSimpleValueType();
12428
12429   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12430   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
12431          "Unexpected vector type in ExtractBitFromMaskVector");
12432
12433   // variable index can't be handled in mask registers,
12434   // extend vector to VR512
12435   if (!isa<ConstantSDNode>(Idx)) {
12436     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12437     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12438     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12439                               ExtVT.getVectorElementType(), Ext, Idx);
12440     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12441   }
12442
12443   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12444   if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
12445     // Use kshiftlw/rw instruction.
12446     VecVT = MVT::v16i1;
12447     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
12448                       DAG.getUNDEF(VecVT),
12449                       Vec,
12450                       DAG.getIntPtrConstant(0, dl));
12451   }
12452   unsigned MaxSift = VecVT.getVectorNumElements() - 1;
12453   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12454                     DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
12455   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12456                     DAG.getConstant(MaxSift, dl, MVT::i8));
12457   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12458                        DAG.getIntPtrConstant(0, dl));
12459 }
12460
12461 SDValue
12462 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12463                                            SelectionDAG &DAG) const {
12464   SDLoc dl(Op);
12465   SDValue Vec = Op.getOperand(0);
12466   MVT VecVT = Vec.getSimpleValueType();
12467   SDValue Idx = Op.getOperand(1);
12468
12469   if (Op.getSimpleValueType() == MVT::i1)
12470     return ExtractBitFromMaskVector(Op, DAG);
12471
12472   if (!isa<ConstantSDNode>(Idx)) {
12473     if (VecVT.is512BitVector() ||
12474         (VecVT.is256BitVector() && Subtarget.hasInt256() &&
12475          VecVT.getVectorElementType().getSizeInBits() == 32)) {
12476
12477       MVT MaskEltVT =
12478         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12479       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12480                                     MaskEltVT.getSizeInBits());
12481
12482       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12483       auto PtrVT = getPointerTy(DAG.getDataLayout());
12484       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12485                                  getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
12486                                  DAG.getConstant(0, dl, PtrVT));
12487       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12488       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
12489                          DAG.getConstant(0, dl, PtrVT));
12490     }
12491     return SDValue();
12492   }
12493
12494   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12495
12496   // If this is a 256-bit vector result, first extract the 128-bit vector and
12497   // then extract the element from the 128-bit vector.
12498   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12499     // Get the 128-bit vector.
12500     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
12501     MVT EltVT = VecVT.getVectorElementType();
12502
12503     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12504     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
12505
12506     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
12507     // this can be done with a mask.
12508     IdxVal &= ElemsPerChunk - 1;
12509     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12510                        DAG.getConstant(IdxVal, dl, MVT::i32));
12511   }
12512
12513   assert(VecVT.is128BitVector() && "Unexpected vector length");
12514
12515   if (Subtarget.hasSSE41())
12516     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
12517       return Res;
12518
12519   MVT VT = Op.getSimpleValueType();
12520   // TODO: handle v16i8.
12521   if (VT.getSizeInBits() == 16) {
12522     if (IdxVal == 0)
12523       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12524                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12525                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
12526
12527     // Transform it so it match pextrw which produces a 32-bit result.
12528     MVT EltVT = MVT::i32;
12529     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
12530     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12531                                   DAG.getValueType(VT));
12532     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12533   }
12534
12535   if (VT.getSizeInBits() == 32) {
12536     if (IdxVal == 0)
12537       return Op;
12538
12539     // SHUFPS the element to the lowest double word, then movss.
12540     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
12541     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12542     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12543                        DAG.getIntPtrConstant(0, dl));
12544   }
12545
12546   if (VT.getSizeInBits() == 64) {
12547     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12548     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12549     //        to match extract_elt for f64.
12550     if (IdxVal == 0)
12551       return Op;
12552
12553     // UNPCKHPD the element to the lowest double word, then movsd.
12554     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12555     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12556     int Mask[2] = { 1, -1 };
12557     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12558     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12559                        DAG.getIntPtrConstant(0, dl));
12560   }
12561
12562   return SDValue();
12563 }
12564
12565 /// Insert one bit to mask vector, like v16i1 or v8i1.
12566 /// AVX-512 feature.
12567 SDValue
12568 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12569   SDLoc dl(Op);
12570   SDValue Vec = Op.getOperand(0);
12571   SDValue Elt = Op.getOperand(1);
12572   SDValue Idx = Op.getOperand(2);
12573   MVT VecVT = Vec.getSimpleValueType();
12574
12575   if (!isa<ConstantSDNode>(Idx)) {
12576     // Non constant index. Extend source and destination,
12577     // insert element and then truncate the result.
12578     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12579     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
12580     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
12581       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
12582       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
12583     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
12584   }
12585
12586   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12587   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
12588   if (IdxVal)
12589     EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12590                            DAG.getConstant(IdxVal, dl, MVT::i8));
12591   if (Vec.isUndef())
12592     return EltInVec;
12593   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
12594 }
12595
12596 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12597                                                   SelectionDAG &DAG) const {
12598   MVT VT = Op.getSimpleValueType();
12599   MVT EltVT = VT.getVectorElementType();
12600   unsigned NumElts = VT.getVectorNumElements();
12601
12602   if (EltVT == MVT::i1)
12603     return InsertBitToMaskVector(Op, DAG);
12604
12605   SDLoc dl(Op);
12606   SDValue N0 = Op.getOperand(0);
12607   SDValue N1 = Op.getOperand(1);
12608   SDValue N2 = Op.getOperand(2);
12609   if (!isa<ConstantSDNode>(N2))
12610     return SDValue();
12611   auto *N2C = cast<ConstantSDNode>(N2);
12612   unsigned IdxVal = N2C->getZExtValue();
12613
12614   // If we are clearing out a element, we do this more efficiently with a
12615   // blend shuffle than a costly integer insertion.
12616   // TODO: would other rematerializable values (e.g. allbits) benefit as well?
12617   // TODO: pre-SSE41 targets will tend to use bit masking - this could still
12618   // be beneficial if we are inserting several zeros and can combine the masks.
12619   if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
12620     SmallVector<int, 8> ClearMask;
12621     for (unsigned i = 0; i != NumElts; ++i)
12622       ClearMask.push_back(i == IdxVal ? i + NumElts : i);
12623     SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
12624     return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
12625   }
12626
12627   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
12628   // into that, and then insert the subvector back into the result.
12629   if (VT.is256BitVector() || VT.is512BitVector()) {
12630     // With a 256-bit vector, we can insert into the zero element efficiently
12631     // using a blend if we have AVX or AVX2 and the right data type.
12632     if (VT.is256BitVector() && IdxVal == 0) {
12633       // TODO: It is worthwhile to cast integer to floating point and back
12634       // and incur a domain crossing penalty if that's what we'll end up
12635       // doing anyway after extracting to a 128-bit vector.
12636       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12637           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
12638         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
12639         N2 = DAG.getIntPtrConstant(1, dl);
12640         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
12641       }
12642     }
12643
12644     // Get the desired 128-bit vector chunk.
12645     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
12646
12647     // Insert the element into the desired chunk.
12648     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
12649     assert(isPowerOf2_32(NumEltsIn128));
12650     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
12651     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
12652
12653     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
12654                     DAG.getConstant(IdxIn128, dl, MVT::i32));
12655
12656     // Insert the changed part back into the bigger vector
12657     return insert128BitVector(N0, V, IdxVal, DAG, dl);
12658   }
12659   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
12660
12661   if (Subtarget.hasSSE41()) {
12662     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
12663       unsigned Opc;
12664       if (VT == MVT::v8i16) {
12665         Opc = X86ISD::PINSRW;
12666       } else {
12667         assert(VT == MVT::v16i8);
12668         Opc = X86ISD::PINSRB;
12669       }
12670
12671       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
12672       // argument.
12673       if (N1.getValueType() != MVT::i32)
12674         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12675       if (N2.getValueType() != MVT::i32)
12676         N2 = DAG.getIntPtrConstant(IdxVal, dl);
12677       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
12678     }
12679
12680     if (EltVT == MVT::f32) {
12681       // Bits [7:6] of the constant are the source select. This will always be
12682       //   zero here. The DAG Combiner may combine an extract_elt index into
12683       //   these bits. For example (insert (extract, 3), 2) could be matched by
12684       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
12685       // Bits [5:4] of the constant are the destination select. This is the
12686       //   value of the incoming immediate.
12687       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
12688       //   combine either bitwise AND or insert of float 0.0 to set these bits.
12689
12690       bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
12691       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
12692         // If this is an insertion of 32-bits into the low 32-bits of
12693         // a vector, we prefer to generate a blend with immediate rather
12694         // than an insertps. Blends are simpler operations in hardware and so
12695         // will always have equal or better performance than insertps.
12696         // But if optimizing for size and there's a load folding opportunity,
12697         // generate insertps because blendps does not have a 32-bit memory
12698         // operand form.
12699         N2 = DAG.getIntPtrConstant(1, dl);
12700         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12701         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
12702       }
12703       N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
12704       // Create this as a scalar to vector..
12705       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12706       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
12707     }
12708
12709     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
12710       // PINSR* works with constant index.
12711       return Op;
12712     }
12713   }
12714
12715   if (EltVT == MVT::i8)
12716     return SDValue();
12717
12718   if (EltVT.getSizeInBits() == 16) {
12719     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
12720     // as its second argument.
12721     if (N1.getValueType() != MVT::i32)
12722       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12723     if (N2.getValueType() != MVT::i32)
12724       N2 = DAG.getIntPtrConstant(IdxVal, dl);
12725     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
12726   }
12727   return SDValue();
12728 }
12729
12730 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
12731   SDLoc dl(Op);
12732   MVT OpVT = Op.getSimpleValueType();
12733
12734   // If this is a 256-bit vector result, first insert into a 128-bit
12735   // vector and then insert into the 256-bit vector.
12736   if (!OpVT.is128BitVector()) {
12737     // Insert into a 128-bit vector.
12738     unsigned SizeFactor = OpVT.getSizeInBits()/128;
12739     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
12740                                  OpVT.getVectorNumElements() / SizeFactor);
12741
12742     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
12743
12744     // Insert the 128-bit vector.
12745     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
12746   }
12747
12748   if (OpVT == MVT::v1i64 &&
12749       Op.getOperand(0).getValueType() == MVT::i64)
12750     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
12751
12752   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
12753   assert(OpVT.is128BitVector() && "Expected an SSE type!");
12754   return DAG.getBitcast(
12755       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
12756 }
12757
12758 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
12759 // a simple subregister reference or explicit instructions to grab
12760 // upper bits of a vector.
12761 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12762                                       SelectionDAG &DAG) {
12763   SDLoc dl(Op);
12764   SDValue In =  Op.getOperand(0);
12765   SDValue Idx = Op.getOperand(1);
12766   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12767   MVT ResVT   = Op.getSimpleValueType();
12768   MVT InVT    = In.getSimpleValueType();
12769
12770   if (Subtarget.hasFp256()) {
12771     if (ResVT.is128BitVector() &&
12772         (InVT.is256BitVector() || InVT.is512BitVector()) &&
12773         isa<ConstantSDNode>(Idx)) {
12774       return extract128BitVector(In, IdxVal, DAG, dl);
12775     }
12776     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
12777         isa<ConstantSDNode>(Idx)) {
12778       return extract256BitVector(In, IdxVal, DAG, dl);
12779     }
12780   }
12781   return SDValue();
12782 }
12783
12784 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
12785 // simple superregister reference or explicit instructions to insert
12786 // the upper bits of a vector.
12787 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12788                                      SelectionDAG &DAG) {
12789   if (!Subtarget.hasAVX())
12790     return SDValue();
12791
12792   SDLoc dl(Op);
12793   SDValue Vec = Op.getOperand(0);
12794   SDValue SubVec = Op.getOperand(1);
12795   SDValue Idx = Op.getOperand(2);
12796
12797   if (!isa<ConstantSDNode>(Idx))
12798     return SDValue();
12799
12800   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12801   MVT OpVT = Op.getSimpleValueType();
12802   MVT SubVecVT = SubVec.getSimpleValueType();
12803
12804   // Fold two 16-byte subvector loads into one 32-byte load:
12805   // (insert_subvector (insert_subvector undef, (load addr), 0),
12806   //                   (load addr + 16), Elts/2)
12807   // --> load32 addr
12808   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
12809       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
12810       OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
12811     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
12812     if (Idx2 && Idx2->getZExtValue() == 0) {
12813       // If needed, look through bitcasts to get to the load.
12814       SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
12815       if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
12816         bool Fast;
12817         unsigned Alignment = FirstLd->getAlignment();
12818         unsigned AS = FirstLd->getAddressSpace();
12819         const X86TargetLowering *TLI = Subtarget.getTargetLowering();
12820         if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
12821                                     OpVT, AS, Alignment, &Fast) && Fast) {
12822           SDValue Ops[] = { SubVec2, SubVec };
12823           if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
12824             return Ld;
12825         }
12826       }
12827     }
12828   }
12829
12830   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
12831       SubVecVT.is128BitVector())
12832     return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
12833
12834   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
12835     return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
12836
12837   if (OpVT.getVectorElementType() == MVT::i1)
12838     return insert1BitVector(Op, DAG, Subtarget);
12839
12840   return SDValue();
12841 }
12842
12843 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
12844 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
12845 // one of the above mentioned nodes. It has to be wrapped because otherwise
12846 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
12847 // be used to form addressing mode. These wrapped nodes will be selected
12848 // into MOV32ri.
12849 SDValue
12850 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
12851   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12852
12853   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12854   // global base reg.
12855   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12856   unsigned WrapperKind = X86ISD::Wrapper;
12857   CodeModel::Model M = DAG.getTarget().getCodeModel();
12858
12859   if (Subtarget.isPICStyleRIPRel() &&
12860       (M == CodeModel::Small || M == CodeModel::Kernel))
12861     WrapperKind = X86ISD::WrapperRIP;
12862
12863   auto PtrVT = getPointerTy(DAG.getDataLayout());
12864   SDValue Result = DAG.getTargetConstantPool(
12865       CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
12866   SDLoc DL(CP);
12867   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12868   // With PIC, the address is actually $g + Offset.
12869   if (OpFlag) {
12870     Result =
12871         DAG.getNode(ISD::ADD, DL, PtrVT,
12872                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12873   }
12874
12875   return Result;
12876 }
12877
12878 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
12879   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12880
12881   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12882   // global base reg.
12883   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12884   unsigned WrapperKind = X86ISD::Wrapper;
12885   CodeModel::Model M = DAG.getTarget().getCodeModel();
12886
12887   if (Subtarget.isPICStyleRIPRel() &&
12888       (M == CodeModel::Small || M == CodeModel::Kernel))
12889     WrapperKind = X86ISD::WrapperRIP;
12890
12891   auto PtrVT = getPointerTy(DAG.getDataLayout());
12892   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
12893   SDLoc DL(JT);
12894   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12895
12896   // With PIC, the address is actually $g + Offset.
12897   if (OpFlag)
12898     Result =
12899         DAG.getNode(ISD::ADD, DL, PtrVT,
12900                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12901
12902   return Result;
12903 }
12904
12905 SDValue
12906 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
12907   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
12908
12909   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12910   // global base reg.
12911   const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
12912   unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
12913   unsigned WrapperKind = X86ISD::Wrapper;
12914   CodeModel::Model M = DAG.getTarget().getCodeModel();
12915
12916   if (Subtarget.isPICStyleRIPRel() &&
12917       (M == CodeModel::Small || M == CodeModel::Kernel))
12918     WrapperKind = X86ISD::WrapperRIP;
12919
12920   auto PtrVT = getPointerTy(DAG.getDataLayout());
12921   SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
12922
12923   SDLoc DL(Op);
12924   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12925
12926   // With PIC, the address is actually $g + Offset.
12927   if (isPositionIndependent() && !Subtarget.is64Bit()) {
12928     Result =
12929         DAG.getNode(ISD::ADD, DL, PtrVT,
12930                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12931   }
12932
12933   // For symbols that require a load from a stub to get the address, emit the
12934   // load.
12935   if (isGlobalStubReference(OpFlag))
12936     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
12937                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
12938
12939   return Result;
12940 }
12941
12942 SDValue
12943 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
12944   // Create the TargetBlockAddressAddress node.
12945   unsigned char OpFlags =
12946     Subtarget.classifyBlockAddressReference();
12947   CodeModel::Model M = DAG.getTarget().getCodeModel();
12948   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
12949   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
12950   SDLoc dl(Op);
12951   auto PtrVT = getPointerTy(DAG.getDataLayout());
12952   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
12953
12954   if (Subtarget.isPICStyleRIPRel() &&
12955       (M == CodeModel::Small || M == CodeModel::Kernel))
12956     Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12957   else
12958     Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12959
12960   // With PIC, the address is actually $g + Offset.
12961   if (isGlobalRelativeToPICBase(OpFlags)) {
12962     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12963                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12964   }
12965
12966   return Result;
12967 }
12968
12969 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
12970                                               const SDLoc &dl, int64_t Offset,
12971                                               SelectionDAG &DAG) const {
12972   // Create the TargetGlobalAddress node, folding in the constant
12973   // offset if it is legal.
12974   unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
12975   CodeModel::Model M = DAG.getTarget().getCodeModel();
12976   auto PtrVT = getPointerTy(DAG.getDataLayout());
12977   SDValue Result;
12978   if (OpFlags == X86II::MO_NO_FLAG &&
12979       X86::isOffsetSuitableForCodeModel(Offset, M)) {
12980     // A direct static reference to a global.
12981     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
12982     Offset = 0;
12983   } else {
12984     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
12985   }
12986
12987   if (Subtarget.isPICStyleRIPRel() &&
12988       (M == CodeModel::Small || M == CodeModel::Kernel))
12989     Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12990   else
12991     Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12992
12993   // With PIC, the address is actually $g + Offset.
12994   if (isGlobalRelativeToPICBase(OpFlags)) {
12995     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12996                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12997   }
12998
12999   // For globals that require a load from a stub to get the address, emit the
13000   // load.
13001   if (isGlobalStubReference(OpFlags))
13002     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
13003                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
13004
13005   // If there was a non-zero offset that we didn't fold, create an explicit
13006   // addition for it.
13007   if (Offset != 0)
13008     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
13009                          DAG.getConstant(Offset, dl, PtrVT));
13010
13011   return Result;
13012 }
13013
13014 SDValue
13015 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13016   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13017   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13018   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13019 }
13020
13021 static SDValue
13022 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13023            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13024            unsigned char OperandFlags, bool LocalDynamic = false) {
13025   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13026   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13027   SDLoc dl(GA);
13028   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13029                                            GA->getValueType(0),
13030                                            GA->getOffset(),
13031                                            OperandFlags);
13032
13033   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13034                                            : X86ISD::TLSADDR;
13035
13036   if (InFlag) {
13037     SDValue Ops[] = { Chain,  TGA, *InFlag };
13038     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13039   } else {
13040     SDValue Ops[]  = { Chain, TGA };
13041     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13042   }
13043
13044   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13045   MFI->setAdjustsStack(true);
13046   MFI->setHasCalls(true);
13047
13048   SDValue Flag = Chain.getValue(1);
13049   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13050 }
13051
13052 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13053 static SDValue
13054 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13055                                 const EVT PtrVT) {
13056   SDValue InFlag;
13057   SDLoc dl(GA);  // ? function entry point might be better
13058   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13059                                    DAG.getNode(X86ISD::GlobalBaseReg,
13060                                                SDLoc(), PtrVT), InFlag);
13061   InFlag = Chain.getValue(1);
13062
13063   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13064 }
13065
13066 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13067 static SDValue
13068 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13069                                 const EVT PtrVT) {
13070   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13071                     X86::RAX, X86II::MO_TLSGD);
13072 }
13073
13074 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13075                                            SelectionDAG &DAG,
13076                                            const EVT PtrVT,
13077                                            bool is64Bit) {
13078   SDLoc dl(GA);
13079
13080   // Get the start address of the TLS block for this module.
13081   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13082       .getInfo<X86MachineFunctionInfo>();
13083   MFI->incNumLocalDynamicTLSAccesses();
13084
13085   SDValue Base;
13086   if (is64Bit) {
13087     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13088                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13089   } else {
13090     SDValue InFlag;
13091     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13092         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13093     InFlag = Chain.getValue(1);
13094     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13095                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13096   }
13097
13098   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13099   // of Base.
13100
13101   // Build x@dtpoff.
13102   unsigned char OperandFlags = X86II::MO_DTPOFF;
13103   unsigned WrapperKind = X86ISD::Wrapper;
13104   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13105                                            GA->getValueType(0),
13106                                            GA->getOffset(), OperandFlags);
13107   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13108
13109   // Add x@dtpoff with the base.
13110   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13111 }
13112
13113 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13114 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13115                                    const EVT PtrVT, TLSModel::Model model,
13116                                    bool is64Bit, bool isPIC) {
13117   SDLoc dl(GA);
13118
13119   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13120   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13121                                                          is64Bit ? 257 : 256));
13122
13123   SDValue ThreadPointer =
13124       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
13125                   MachinePointerInfo(Ptr));
13126
13127   unsigned char OperandFlags = 0;
13128   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13129   // initialexec.
13130   unsigned WrapperKind = X86ISD::Wrapper;
13131   if (model == TLSModel::LocalExec) {
13132     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13133   } else if (model == TLSModel::InitialExec) {
13134     if (is64Bit) {
13135       OperandFlags = X86II::MO_GOTTPOFF;
13136       WrapperKind = X86ISD::WrapperRIP;
13137     } else {
13138       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13139     }
13140   } else {
13141     llvm_unreachable("Unexpected model");
13142   }
13143
13144   // emit "addl x@ntpoff,%eax" (local exec)
13145   // or "addl x@indntpoff,%eax" (initial exec)
13146   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13147   SDValue TGA =
13148       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13149                                  GA->getOffset(), OperandFlags);
13150   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13151
13152   if (model == TLSModel::InitialExec) {
13153     if (isPIC && !is64Bit) {
13154       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13155                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13156                            Offset);
13157     }
13158
13159     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13160                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
13161   }
13162
13163   // The address of the thread local variable is the add of the thread
13164   // pointer with the offset of the variable.
13165   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13166 }
13167
13168 SDValue
13169 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13170
13171   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13172
13173   if (DAG.getTarget().Options.EmulatedTLS)
13174     return LowerToTLSEmulatedModel(GA, DAG);
13175
13176   const GlobalValue *GV = GA->getGlobal();
13177   auto PtrVT = getPointerTy(DAG.getDataLayout());
13178   bool PositionIndependent = isPositionIndependent();
13179
13180   if (Subtarget.isTargetELF()) {
13181     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13182     switch (model) {
13183       case TLSModel::GeneralDynamic:
13184         if (Subtarget.is64Bit())
13185           return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
13186         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
13187       case TLSModel::LocalDynamic:
13188         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
13189                                            Subtarget.is64Bit());
13190       case TLSModel::InitialExec:
13191       case TLSModel::LocalExec:
13192         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
13193                                    PositionIndependent);
13194     }
13195     llvm_unreachable("Unknown TLS model.");
13196   }
13197
13198   if (Subtarget.isTargetDarwin()) {
13199     // Darwin only has one model of TLS.  Lower to that.
13200     unsigned char OpFlag = 0;
13201     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
13202                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13203
13204     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13205     // global base reg.
13206     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
13207     if (PIC32)
13208       OpFlag = X86II::MO_TLVP_PIC_BASE;
13209     else
13210       OpFlag = X86II::MO_TLVP;
13211     SDLoc DL(Op);
13212     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13213                                                 GA->getValueType(0),
13214                                                 GA->getOffset(), OpFlag);
13215     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
13216
13217     // With PIC32, the address is actually $g + Offset.
13218     if (PIC32)
13219       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
13220                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13221                            Offset);
13222
13223     // Lowering the machine isd will make sure everything is in the right
13224     // location.
13225     SDValue Chain = DAG.getEntryNode();
13226     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13227     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
13228     SDValue Args[] = { Chain, Offset };
13229     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13230     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
13231                                DAG.getIntPtrConstant(0, DL, true),
13232                                Chain.getValue(1), DL);
13233
13234     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13235     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13236     MFI->setAdjustsStack(true);
13237
13238     // And our return value (tls address) is in the standard call return value
13239     // location.
13240     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
13241     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
13242   }
13243
13244   if (Subtarget.isTargetKnownWindowsMSVC() ||
13245       Subtarget.isTargetWindowsItanium() ||
13246       Subtarget.isTargetWindowsGNU()) {
13247     // Just use the implicit TLS architecture
13248     // Need to generate someting similar to:
13249     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13250     //                                  ; from TEB
13251     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13252     //   mov     rcx, qword [rdx+rcx*8]
13253     //   mov     eax, .tls$:tlsvar
13254     //   [rax+rcx] contains the address
13255     // Windows 64bit: gs:0x58
13256     // Windows 32bit: fs:__tls_array
13257
13258     SDLoc dl(GA);
13259     SDValue Chain = DAG.getEntryNode();
13260
13261     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13262     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13263     // use its literal value of 0x2C.
13264     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
13265                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13266                                                              256)
13267                                         : Type::getInt32PtrTy(*DAG.getContext(),
13268                                                               257));
13269
13270     SDValue TlsArray = Subtarget.is64Bit()
13271                            ? DAG.getIntPtrConstant(0x58, dl)
13272                            : (Subtarget.isTargetWindowsGNU()
13273                                   ? DAG.getIntPtrConstant(0x2C, dl)
13274                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
13275
13276     SDValue ThreadPointer =
13277         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
13278
13279     SDValue res;
13280     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
13281       res = ThreadPointer;
13282     } else {
13283       // Load the _tls_index variable
13284       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
13285       if (Subtarget.is64Bit())
13286         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
13287                              MachinePointerInfo(), MVT::i32);
13288       else
13289         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
13290
13291       auto &DL = DAG.getDataLayout();
13292       SDValue Scale =
13293           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
13294       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
13295
13296       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
13297     }
13298
13299     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
13300
13301     // Get the offset of start of .tls section
13302     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13303                                              GA->getValueType(0),
13304                                              GA->getOffset(), X86II::MO_SECREL);
13305     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
13306
13307     // The address of the thread local variable is the add of the thread
13308     // pointer with the offset of the variable.
13309     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
13310   }
13311
13312   llvm_unreachable("TLS not implemented for this target.");
13313 }
13314
13315 /// Lower SRA_PARTS and friends, which return two i32 values
13316 /// and take a 2 x i32 value to shift plus a shift amount.
13317 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13318   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13319   MVT VT = Op.getSimpleValueType();
13320   unsigned VTBits = VT.getSizeInBits();
13321   SDLoc dl(Op);
13322   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13323   SDValue ShOpLo = Op.getOperand(0);
13324   SDValue ShOpHi = Op.getOperand(1);
13325   SDValue ShAmt  = Op.getOperand(2);
13326   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13327   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13328   // during isel.
13329   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13330                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
13331   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13332                                      DAG.getConstant(VTBits - 1, dl, MVT::i8))
13333                        : DAG.getConstant(0, dl, VT);
13334
13335   SDValue Tmp2, Tmp3;
13336   if (Op.getOpcode() == ISD::SHL_PARTS) {
13337     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13338     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13339   } else {
13340     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13341     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13342   }
13343
13344   // If the shift amount is larger or equal than the width of a part we can't
13345   // rely on the results of shld/shrd. Insert a test and select the appropriate
13346   // values for large shift amounts.
13347   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13348                                 DAG.getConstant(VTBits, dl, MVT::i8));
13349   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13350                              AndNode, DAG.getConstant(0, dl, MVT::i8));
13351
13352   SDValue Hi, Lo;
13353   SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
13354   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13355   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13356
13357   if (Op.getOpcode() == ISD::SHL_PARTS) {
13358     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13359     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13360   } else {
13361     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13362     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13363   }
13364
13365   SDValue Ops[2] = { Lo, Hi };
13366   return DAG.getMergeValues(Ops, dl);
13367 }
13368
13369 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13370                                            SelectionDAG &DAG) const {
13371   SDValue Src = Op.getOperand(0);
13372   MVT SrcVT = Src.getSimpleValueType();
13373   MVT VT = Op.getSimpleValueType();
13374   SDLoc dl(Op);
13375
13376   if (SrcVT.isVector()) {
13377     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
13378       return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
13379                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
13380                          DAG.getUNDEF(SrcVT)));
13381     }
13382     if (SrcVT.getVectorElementType() == MVT::i1) {
13383       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13384       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13385                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
13386     }
13387     return SDValue();
13388   }
13389
13390   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13391          "Unknown SINT_TO_FP to lower!");
13392
13393   // These are really Legal; return the operand so the caller accepts it as
13394   // Legal.
13395   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13396     return Op;
13397   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13398       Subtarget.is64Bit()) {
13399     return Op;
13400   }
13401
13402   SDValue ValueToStore = Op.getOperand(0);
13403   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13404       !Subtarget.is64Bit())
13405     // Bitcasting to f64 here allows us to do a single 64-bit store from
13406     // an SSE register, avoiding the store forwarding penalty that would come
13407     // with two 32-bit stores.
13408     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13409
13410   unsigned Size = SrcVT.getSizeInBits()/8;
13411   MachineFunction &MF = DAG.getMachineFunction();
13412   auto PtrVT = getPointerTy(MF.getDataLayout());
13413   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13414   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13415   SDValue Chain = DAG.getStore(
13416       DAG.getEntryNode(), dl, ValueToStore, StackSlot,
13417       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
13418   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13419 }
13420
13421 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13422                                      SDValue StackSlot,
13423                                      SelectionDAG &DAG) const {
13424   // Build the FILD
13425   SDLoc DL(Op);
13426   SDVTList Tys;
13427   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13428   if (useSSE)
13429     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13430   else
13431     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13432
13433   unsigned ByteSize = SrcVT.getSizeInBits()/8;
13434
13435   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13436   MachineMemOperand *MMO;
13437   if (FI) {
13438     int SSFI = FI->getIndex();
13439     MMO = DAG.getMachineFunction().getMachineMemOperand(
13440         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13441         MachineMemOperand::MOLoad, ByteSize, ByteSize);
13442   } else {
13443     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13444     StackSlot = StackSlot.getOperand(1);
13445   }
13446   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13447   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13448                                            X86ISD::FILD, DL,
13449                                            Tys, Ops, SrcVT, MMO);
13450
13451   if (useSSE) {
13452     Chain = Result.getValue(1);
13453     SDValue InFlag = Result.getValue(2);
13454
13455     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13456     // shouldn't be necessary except that RFP cannot be live across
13457     // multiple blocks. When stackifier is fixed, they can be uncoupled.
13458     MachineFunction &MF = DAG.getMachineFunction();
13459     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13460     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13461     auto PtrVT = getPointerTy(MF.getDataLayout());
13462     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13463     Tys = DAG.getVTList(MVT::Other);
13464     SDValue Ops[] = {
13465       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13466     };
13467     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13468         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13469         MachineMemOperand::MOStore, SSFISize, SSFISize);
13470
13471     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13472                                     Ops, Op.getValueType(), MMO);
13473     Result = DAG.getLoad(
13474         Op.getValueType(), DL, Chain, StackSlot,
13475         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
13476   }
13477
13478   return Result;
13479 }
13480
13481 /// 64-bit unsigned integer to double expansion.
13482 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13483                                                SelectionDAG &DAG) const {
13484   // This algorithm is not obvious. Here it is what we're trying to output:
13485   /*
13486      movq       %rax,  %xmm0
13487      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13488      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13489      #ifdef __SSE3__
13490        haddpd   %xmm0, %xmm0
13491      #else
13492        pshufd   $0x4e, %xmm0, %xmm1
13493        addpd    %xmm1, %xmm0
13494      #endif
13495   */
13496
13497   SDLoc dl(Op);
13498   LLVMContext *Context = DAG.getContext();
13499
13500   // Build some magic constants.
13501   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13502   Constant *C0 = ConstantDataVector::get(*Context, CV0);
13503   auto PtrVT = getPointerTy(DAG.getDataLayout());
13504   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
13505
13506   SmallVector<Constant*,2> CV1;
13507   CV1.push_back(
13508     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13509                                       APInt(64, 0x4330000000000000ULL))));
13510   CV1.push_back(
13511     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13512                                       APInt(64, 0x4530000000000000ULL))));
13513   Constant *C1 = ConstantVector::get(CV1);
13514   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
13515
13516   // Load the 64-bit value into an XMM register.
13517   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13518                             Op.getOperand(0));
13519   SDValue CLod0 =
13520       DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13521                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13522                   /* Alignment = */ 16);
13523   SDValue Unpck1 =
13524       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
13525
13526   SDValue CLod1 =
13527       DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13528                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13529                   /* Alignment = */ 16);
13530   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
13531   // TODO: Are there any fast-math-flags to propagate here?
13532   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13533   SDValue Result;
13534
13535   if (Subtarget.hasSSE3()) {
13536     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13537     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13538   } else {
13539     SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
13540     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13541                                            S2F, 0x4E, DAG);
13542     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13543                          DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
13544   }
13545
13546   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13547                      DAG.getIntPtrConstant(0, dl));
13548 }
13549
13550 /// 32-bit unsigned integer to float expansion.
13551 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13552                                                SelectionDAG &DAG) const {
13553   SDLoc dl(Op);
13554   // FP constant to bias correct the final result.
13555   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
13556                                    MVT::f64);
13557
13558   // Load the 32-bit value into an XMM register.
13559   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13560                              Op.getOperand(0));
13561
13562   // Zero out the upper parts of the register.
13563   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13564
13565   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13566                      DAG.getBitcast(MVT::v2f64, Load),
13567                      DAG.getIntPtrConstant(0, dl));
13568
13569   // Or the load with the bias.
13570   SDValue Or = DAG.getNode(
13571       ISD::OR, dl, MVT::v2i64,
13572       DAG.getBitcast(MVT::v2i64,
13573                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
13574       DAG.getBitcast(MVT::v2i64,
13575                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
13576   Or =
13577       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13578                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
13579
13580   // Subtract the bias.
13581   // TODO: Are there any fast-math-flags to propagate here?
13582   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13583
13584   // Handle final rounding.
13585   MVT DestVT = Op.getSimpleValueType();
13586
13587   if (DestVT.bitsLT(MVT::f64))
13588     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13589                        DAG.getIntPtrConstant(0, dl));
13590   if (DestVT.bitsGT(MVT::f64))
13591     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13592
13593   // Handle final rounding.
13594   return Sub;
13595 }
13596
13597 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13598                                      const X86Subtarget &Subtarget) {
13599   // The algorithm is the following:
13600   // #ifdef __SSE4_1__
13601   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13602   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13603   //                                 (uint4) 0x53000000, 0xaa);
13604   // #else
13605   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13606   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13607   // #endif
13608   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13609   //     return (float4) lo + fhi;
13610
13611   // We shouldn't use it when unsafe-fp-math is enabled though: we might later
13612   // reassociate the two FADDs, and if we do that, the algorithm fails
13613   // spectacularly (PR24512).
13614   // FIXME: If we ever have some kind of Machine FMF, this should be marked
13615   // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
13616   // there's also the MachineCombiner reassociations happening on Machine IR.
13617   if (DAG.getTarget().Options.UnsafeFPMath)
13618     return SDValue();
13619
13620   SDLoc DL(Op);
13621   SDValue V = Op->getOperand(0);
13622   MVT VecIntVT = V.getSimpleValueType();
13623   bool Is128 = VecIntVT == MVT::v4i32;
13624   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13625   // If we convert to something else than the supported type, e.g., to v4f64,
13626   // abort early.
13627   if (VecFloatVT != Op->getSimpleValueType(0))
13628     return SDValue();
13629
13630   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13631          "Unsupported custom type");
13632
13633   // In the #idef/#else code, we have in common:
13634   // - The vector of constants:
13635   // -- 0x4b000000
13636   // -- 0x53000000
13637   // - A shift:
13638   // -- v >> 16
13639
13640   // Create the splat vector for 0x4b000000.
13641   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
13642   // Create the splat vector for 0x53000000.
13643   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
13644
13645   // Create the right shift.
13646   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
13647   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
13648
13649   SDValue Low, High;
13650   if (Subtarget.hasSSE41()) {
13651     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
13652     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13653     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
13654     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
13655     // Low will be bitcasted right away, so do not bother bitcasting back to its
13656     // original type.
13657     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
13658                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13659     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13660     //                                 (uint4) 0x53000000, 0xaa);
13661     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
13662     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
13663     // High will be bitcasted right away, so do not bother bitcasting back to
13664     // its original type.
13665     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
13666                        VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13667   } else {
13668     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
13669     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13670     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
13671     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
13672
13673     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13674     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
13675   }
13676
13677   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
13678   SDValue VecCstFAdd = DAG.getConstantFP(
13679       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
13680
13681   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13682   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
13683   // TODO: Are there any fast-math-flags to propagate here?
13684   SDValue FHigh =
13685       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
13686   //     return (float4) lo + fhi;
13687   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
13688   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
13689 }
13690
13691 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
13692                                                SelectionDAG &DAG) const {
13693   SDValue N0 = Op.getOperand(0);
13694   MVT SVT = N0.getSimpleValueType();
13695   SDLoc dl(Op);
13696
13697   switch (SVT.SimpleTy) {
13698   default:
13699     llvm_unreachable("Custom UINT_TO_FP is not supported!");
13700   case MVT::v4i8:
13701   case MVT::v4i16:
13702   case MVT::v8i8:
13703   case MVT::v8i16: {
13704     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
13705     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13706                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
13707   }
13708   case MVT::v4i32:
13709   case MVT::v8i32:
13710     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
13711   case MVT::v16i8:
13712   case MVT::v16i16:
13713     assert(Subtarget.hasAVX512());
13714     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
13715                        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
13716   }
13717 }
13718
13719 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
13720                                            SelectionDAG &DAG) const {
13721   SDValue N0 = Op.getOperand(0);
13722   SDLoc dl(Op);
13723   auto PtrVT = getPointerTy(DAG.getDataLayout());
13724
13725   if (Op.getSimpleValueType().isVector())
13726     return lowerUINT_TO_FP_vec(Op, DAG);
13727
13728   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
13729   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
13730   // the optimization here.
13731   if (DAG.SignBitIsZero(N0))
13732     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
13733
13734   MVT SrcVT = N0.getSimpleValueType();
13735   MVT DstVT = Op.getSimpleValueType();
13736
13737   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
13738       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
13739     // Conversions from unsigned i32 to f32/f64 are legal,
13740     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
13741     return Op;
13742   }
13743
13744   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
13745     return LowerUINT_TO_FP_i64(Op, DAG);
13746   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
13747     return LowerUINT_TO_FP_i32(Op, DAG);
13748   if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
13749     return SDValue();
13750
13751   // Make a 64-bit buffer, and use it to build an FILD.
13752   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
13753   if (SrcVT == MVT::i32) {
13754     SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
13755     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13756                                   StackSlot, MachinePointerInfo());
13757     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
13758                                   OffsetSlot, MachinePointerInfo());
13759     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
13760     return Fild;
13761   }
13762
13763   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
13764   SDValue ValueToStore = Op.getOperand(0);
13765   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
13766     // Bitcasting to f64 here allows us to do a single 64-bit store from
13767     // an SSE register, avoiding the store forwarding penalty that would come
13768     // with two 32-bit stores.
13769     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13770   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
13771                                MachinePointerInfo());
13772   // For i64 source, we need to add the appropriate power of 2 if the input
13773   // was negative.  This is the same as the optimization in
13774   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
13775   // we must be careful to do the computation in x87 extended precision, not
13776   // in SSE. (The generic code can't know it's OK to do this, or how to.)
13777   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
13778   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13779       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13780       MachineMemOperand::MOLoad, 8, 8);
13781
13782   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
13783   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
13784   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
13785                                          MVT::i64, MMO);
13786
13787   APInt FF(32, 0x5F800000ULL);
13788
13789   // Check whether the sign bit is set.
13790   SDValue SignSet = DAG.getSetCC(
13791       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
13792       Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
13793
13794   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
13795   SDValue FudgePtr = DAG.getConstantPool(
13796       ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
13797
13798   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
13799   SDValue Zero = DAG.getIntPtrConstant(0, dl);
13800   SDValue Four = DAG.getIntPtrConstant(4, dl);
13801   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
13802                                Zero, Four);
13803   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
13804
13805   // Load the value out, extending it from f32 to f80.
13806   // FIXME: Avoid the extend by constructing the right constant pool?
13807   SDValue Fudge = DAG.getExtLoad(
13808       ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
13809       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
13810       /* Alignment = */ 4);
13811   // Extend everything to 80 bits to force it to be done on x87.
13812   // TODO: Are there any fast-math-flags to propagate here?
13813   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
13814   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
13815                      DAG.getIntPtrConstant(0, dl));
13816 }
13817
13818 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
13819 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
13820 // just return an <SDValue(), SDValue()> pair.
13821 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
13822 // to i16, i32 or i64, and we lower it to a legal sequence.
13823 // If lowered to the final integer result we return a <result, SDValue()> pair.
13824 // Otherwise we lower it to a sequence ending with a FIST, return a
13825 // <FIST, StackSlot> pair, and the caller is responsible for loading
13826 // the final integer result from StackSlot.
      //
      // IsReplace only matters on the unsigned-i64 fixup path when the subtarget
      // is not 64-bit: when true the two i32 halves are combined with BUILD_PAIR
      // into a single i64 value, otherwise they are returned via getMergeValues.
13827 std::pair<SDValue,SDValue>
13828 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
13829                                    bool IsSigned, bool IsReplace) const {
13830   SDLoc DL(Op);
13831
13832   EVT DstTy = Op.getValueType();
13833   EVT TheVT = Op.getOperand(0).getValueType();
13834   auto PtrVT = getPointerTy(DAG.getDataLayout());
13835
13836   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
13837     // f16 must be promoted before using the lowering in this routine.
13838     // fp128 does not use this lowering.
13839     return std::make_pair(SDValue(), SDValue());
13840   }
13841
13842   // If using FIST to compute an unsigned i64, we'll need some fixup
13843   // to handle values above the maximum signed i64.  A FIST is always
13844   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
        // Note this is computed from the original DstTy, before the widening of
        // unsigned i32 destinations performed just below.
13845   bool UnsignedFixup = !IsSigned &&
13846                        DstTy == MVT::i64 &&
13847                        (!Subtarget.is64Bit() ||
13848                         !isScalarFPTypeInSSEReg(TheVT));
13849
13850   if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
13851     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
13852     // The low 32 bits of the fist result will have the correct uint32 result.
13853     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
13854     DstTy = MVT::i64;
13855   }
13856
13857   assert(DstTy.getSimpleVT() <= MVT::i64 &&
13858          DstTy.getSimpleVT() >= MVT::i16 &&
13859          "Unknown FP_TO_INT to lower!");
13860
13861   // These are really Legal.
        // (i32 results from an SSE-register source, and i64 results from an
        // SSE-register source on a 64-bit subtarget.)
13862   if (DstTy == MVT::i32 &&
13863       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13864     return std::make_pair(SDValue(), SDValue());
13865   if (Subtarget.is64Bit() &&
13866       DstTy == MVT::i64 &&
13867       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13868     return std::make_pair(SDValue(), SDValue());
13869
13870   // We lower FP->int64 into FISTP64 followed by a load from a temporary
13871   // stack slot.
        // The slot is sized from the (possibly widened) destination type.
13872   MachineFunction &MF = DAG.getMachineFunction();
13873   unsigned MemSize = DstTy.getSizeInBits()/8;
13874   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13875   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13876
        // Pick the store-to-memory conversion opcode matching the result width.
13877   unsigned Opc;
13878   switch (DstTy.getSimpleVT().SimpleTy) {
13879   default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
13880   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
13881   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
13882   case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
13883   }
13884
13885   SDValue Chain = DAG.getEntryNode();
13886   SDValue Value = Op.getOperand(0);
13887   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
13888
13889   if (UnsignedFixup) {
13890     //
13891     // Conversion to unsigned i64 is implemented with a select,
13892     // depending on whether the source value fits in the range
13893     // of a signed i64.  Let Thresh be the FP equivalent of
13894     // 0x8000000000000000ULL.
13895     //
13896     //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
13897     //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
13898     //  Fist-to-mem64 FistSrc
13899     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
13900     //  to XOR'ing the high 32 bits with Adjust.
13901     //
13902     // Being a power of 2, Thresh is exactly representable in all FP formats.
13903     // For X87 we'd like to use the smallest FP type for this constant, but
13904     // for DAG type consistency we have to match the FP operand type.
13905
          // 0x5f000000 is the IEEE single-precision bit pattern of 2^63; it is
          // converted below to the source FP type when that is f64 or f80.
13906     APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
13907     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
13908     bool LosesInfo = false;
13909     if (TheVT == MVT::f64)
13910       // The rounding mode is irrelevant as the conversion should be exact.
13911       Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
13912                               &LosesInfo);
13913     else if (TheVT == MVT::f80)
13914       Status = Thresh.convert(APFloat::x87DoubleExtended,
13915                               APFloat::rmNearestTiesToEven, &LosesInfo);
13916
13917     assert(Status == APFloat::opOK && !LosesInfo &&
13918            "FP conversion should have been exact");
13919
13920     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
13921
13922     SDValue Cmp = DAG.getSetCC(DL,
13923                                getSetCCResultType(DAG.getDataLayout(),
13924                                                   *DAG.getContext(), TheVT),
13925                                Value, ThreshVal, ISD::SETLT);
13926     Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
13927                            DAG.getConstant(0, DL, MVT::i32),
13928                            DAG.getConstant(0x80000000, DL, MVT::i32));
13929     SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
          // Same comparison as the one feeding Adjust above (identical operands
          // and condition code); it now selects the value handed to the FIST.
13930     Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
13931                                               *DAG.getContext(), TheVT),
13932                        Value, ThreshVal, ISD::SETLT);
13933     Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
13934   }
13935
13936   // FIXME This causes a redundant load/store if the SSE-class value is already
13937   // in memory, such as if it is on the callstack.
13938   if (isScalarFPTypeInSSEReg(TheVT)) {
13939     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
          // Store the SSE-register value to the stack slot and reload it through
          // an X86ISD::FLD node chained after the store. A second stack slot is
          // then created to receive the FIST result.
13940     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
13941                          MachinePointerInfo::getFixedStack(MF, SSFI));
13942     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
13943     SDValue Ops[] = {
13944       Chain, StackSlot, DAG.getValueType(TheVT)
13945     };
13946
13947     MachineMemOperand *MMO =
13948         MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13949                                 MachineMemOperand::MOLoad, MemSize, MemSize);
13950     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
13951     Chain = Value.getValue(1);
13952     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13953     StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13954   }
13955
        // Memory operand describing the store performed by the FIST node into the
        // (possibly re-created) stack slot.
13956   MachineMemOperand *MMO =
13957       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13958                               MachineMemOperand::MOStore, MemSize, MemSize);
13959
13960   if (UnsignedFixup) {
13961
13962     // Insert the FIST, load its result as two i32's,
13963     // and XOR the high i32 with Adjust.
13964
13965     SDValue FistOps[] = { Chain, Value, StackSlot };
13966     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13967                                            FistOps, DstTy, MMO);
13968
          // Both loads use the FIST node as their chain; the high half is read
          // from StackSlot + 4.
13969     SDValue Low32 =
13970         DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
13971     SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
13972
13973     SDValue High32 =
13974         DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
13975     High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
13976
13977     if (Subtarget.is64Bit()) {
13978       // Join High32 and Low32 into a 64-bit result.
13979       // (High32 << 32) | Low32
13980       Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
13981       High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
13982       High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
13983                            DAG.getConstant(32, DL, MVT::i8));
13984       SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
13985       return std::make_pair(Result, SDValue());
13986     }
13987
          // Not a 64-bit subtarget: hand back the two halves, either as one i64
          // BUILD_PAIR (IsReplace) or as a pair of merged values.
13988     SDValue ResultOps[] = { Low32, High32 };
13989
13990     SDValue pair = IsReplace
13991       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
13992       : DAG.getMergeValues(ResultOps, DL);
13993     return std::make_pair(pair, SDValue());
13994   } else {
13995     // Build the FP_TO_INT*_IN_MEM
          // The caller loads the integer result from StackSlot.
13996     SDValue Ops[] = { Chain, Value, StackSlot };
13997     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13998                                            Ops, DstTy, MMO);
13999     return std::make_pair(FIST, StackSlot);
14000   }
14001 }
14002
14003 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14004                               const X86Subtarget &Subtarget) {
        // Lowers a vector extend of Op's single operand to Op's result type.
        // Returns an empty SDValue for type combinations it does not handle.
        // Callers visible in this file use it for ZERO_EXTEND and ANY_EXTEND.
14005   MVT VT = Op->getSimpleValueType(0);
14006   SDValue In = Op->getOperand(0);
14007   MVT InVT = In.getSimpleValueType();
14008   SDLoc dl(Op);
14009
        // 512-bit results and i1-element inputs are emitted as a plain
        // ISD::ZERO_EXTEND node, whatever the opcode of Op is.
14010   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
14011     return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
14012
14013   // Optimize vectors in AVX mode:
14014   //
14015   //   v8i16 -> v8i32
14016   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14017   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14018   //   Concat upper and lower parts.
14019   //
14020   //   v4i32 -> v4i64
14021   //   Use vpunpckldq for 2 lower elements  v4i32 -> v2i64.
14022   //   Use vpunpckhdq for 2 upper elements  v4i32 -> v2i64.
14023   //   Concat upper and lower parts.
14024   //
        //   v16i8 -> v16i16 is accepted by the check below as well and goes
        //   through the same unpack-low / unpack-high / concat sequence.
14025
14026   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14027       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14028       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14029     return SDValue();
14030
        // With hasInt256() the whole extend is a single X86ISD::VZEXT node.
14031   if (Subtarget.hasInt256())
14032     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14033
        // Otherwise unpack the input against zero (ZERO_EXTEND) or undef (any
        // other opcode), reinterpret each half as half of the result type, and
        // concatenate the halves.
14034   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14035   SDValue Undef = DAG.getUNDEF(InVT);
14036   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14037   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14038   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14039
14040   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14041                              VT.getVectorNumElements()/2);
14042
14043   OpLo = DAG.getBitcast(HVT, OpLo);
14044   OpHi = DAG.getBitcast(HVT, OpHi);
14045
14046   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14047 }
14048
14049 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14050                   const X86Subtarget &Subtarget, SelectionDAG &DAG) {
        // Lowers a vector zero-extend whose result is 512 bits wide or whose
        // input has i1 elements (that is the condition under which
        // LowerZERO_EXTEND calls it). Returns an empty SDValue when the element
        // count is neither 8 nor 16 and the subtarget lacks BWI.
14051   MVT VT = Op->getSimpleValueType(0);
14052   SDValue In = Op->getOperand(0);
14053   MVT InVT = In.getSimpleValueType();
14054   SDLoc DL(Op);
14055   unsigned int NumElts = VT.getVectorNumElements();
14056   if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
14057     return SDValue();
14058
        // Non-mask input into a 512-bit result: a single X86ISD::VZEXT node.
14059   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14060     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14061
14062   assert(InVT.getVectorElementType() == MVT::i1);
14063
14064   // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
        // ExtVT then has the same element count with 512/NumElts-bit elements.
14065   MVT ExtVT = VT;
14066   if (!VT.is512BitVector() && !Subtarget.hasVLX())
14067     ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14068
        // The i1 input is used as the condition of a VSELECT between constant 1
        // and constant 0 of type ExtVT.
14069   SDValue One =
14070    DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
14071   SDValue Zero =
14072    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
14073
14074   SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
14075   if (VT == ExtVT)
14076     return SelectedVal;
        // The select was done in the widened type; truncate back to VT.
14077   return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
14078 }
14079
14080 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14081                                SelectionDAG &DAG) {
        // Tries LowerAVXExtend when the subtarget reports hasFp256(); returns an
        // empty SDValue if that is unavailable or produced nothing.
14082   if (Subtarget.hasFp256())
14083     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14084       return Res;
14085
14086   return SDValue();
14087 }
14088
14089 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14090                                 SelectionDAG &DAG) {
        // Dispatches a vector zero-extend: 512-bit results and i1-element inputs
        // go to LowerZERO_EXTEND_AVX512; otherwise LowerAVXExtend is tried when
        // hasFp256() holds. Returns an empty SDValue if neither path applies.
14091   SDLoc DL(Op);
14092   MVT VT = Op.getSimpleValueType();
14093   SDValue In = Op.getOperand(0);
14094   MVT SVT = In.getSimpleValueType();
14095
14096   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14097     return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
14098
14099   if (Subtarget.hasFp256())
14100     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14101       return Res;
14102
        // A 128-bit -> 256-bit extend with matching element counts is not
        // expected to reach this point.
14103   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14104          VT.getVectorNumElements() != SVT.getVectorNumElements());
14105   return SDValue();
14106 }
14107
14108 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
14109                                   const X86Subtarget &Subtarget) {
        // Lowers a truncate of an integer vector to a vector of i1. The least
        // significant bit of every input element is shifted into the most
        // significant position, then X86ISD::CVT2MASK or X86ISD::TESTM builds
        // the i1 vector.
14110
14111   SDLoc DL(Op);
14112   MVT VT = Op.getSimpleValueType();
14113   SDValue In = Op.getOperand(0);
14114   MVT InVT = In.getSimpleValueType();
14115
14116   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
14117
14118   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
14119   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
14120   if (InVT.getScalarSizeInBits() <= 16) {
14121     if (Subtarget.hasBWI()) {
14122       // legal, will go to VPMOVB2M, VPMOVW2M
14123       // Shift packed bytes not supported natively, bitcast to word
14124       MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
14125       SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
14126                                        DAG.getBitcast(ExtVT, In),
14127                                        DAG.getConstant(ShiftInx, DL, ExtVT));
14128       ShiftNode = DAG.getBitcast(InVT, ShiftNode);
14129       return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
14130     }
14131     // Use TESTD/Q, extended vector to packed dword/qword.
          // The input is sign-extended to 512/NumElts-bit elements and the shift
          // amount is recomputed for the widened element size.
14132     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
14133            "Unexpected vector type.");
14134     unsigned NumElts = InVT.getVectorNumElements();
14135     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14136     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14137     InVT = ExtVT;
14138     ShiftInx = InVT.getScalarSizeInBits() - 1;
14139   }
14140
        // TESTM takes the shifted vector as both of its operands.
14141   SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
14142                                    DAG.getConstant(ShiftInx, DL, InVT));
14143   return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
14144 }
14145
14146 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering for ISD::TRUNCATE. Handles scalar truncation to i1 and
        // several vector truncations; returns an empty SDValue for cases it does
        // not lower itself.
14147   SDLoc DL(Op);
14148   MVT VT = Op.getSimpleValueType();
14149   SDValue In = Op.getOperand(0);
14150   MVT InVT = In.getSimpleValueType();
14151
14152   if (VT == MVT::i1) {
14153     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14154            "Invalid scalar TRUNCATE operation");
          // Sources of 32 bits or more are not handled here. Narrower sources
          // are any-extended to i32 first and then truncated to i1.
14155     if (InVT.getSizeInBits() >= 32)
14156       return SDValue();
14157     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14158     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14159   }
14160   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14161          "Invalid TRUNCATE operation");
14162
14163   if (VT.getVectorElementType() == MVT::i1)
14164     return LowerTruncateVecI1(Op, DAG, Subtarget);
14165
14166   // vpmovqb/w/d, vpmovdb/w, vpmovwb
14167   if (Subtarget.hasAVX512()) {
14168     // word to byte only under BWI
          // Without BWI the v16i16 input is first sign-extended to v16i32 and
          // the truncate is applied to that.
14169     if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
14170       return DAG.getNode(X86ISD::VTRUNC, DL, VT,
14171                          DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
14172     return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14173   }
14174   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14175     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14176     if (Subtarget.hasInt256()) {
            // View the input as v8i32, move elements 0, 2, 4, 6 to the front and
            // extract the low four elements.
14177       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14178       In = DAG.getBitcast(MVT::v8i32, In);
14179       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14180                                 ShufMask);
14181       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14182                          DAG.getIntPtrConstant(0, DL));
14183     }
14184
          // No hasInt256(): split into two v2i64 halves, view each as v4i32 and
          // shuffle elements 0 and 2 of each half together.
14185     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14186                                DAG.getIntPtrConstant(0, DL));
14187     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14188                                DAG.getIntPtrConstant(2, DL));
14189     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14190     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14191     static const int ShufMask[] = {0, 2, 4, 6};
14192     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14193   }
14194
14195   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14196     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
14197     if (Subtarget.hasInt256()) {
14198       In = DAG.getBitcast(MVT::v32i8, In);
14199
            // Build the 32-entry byte mask as two identical groups of 16: byte
            // indices 0,1,4,5,8,9,12,13 followed by eight 0x80 entries.
14200       SmallVector<SDValue,32> pshufbMask;
14201       for (unsigned i = 0; i < 2; ++i) {
14202         pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
14203         pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
14204         pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
14205         pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
14206         pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
14207         pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
14208         pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
14209         pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
14210         for (unsigned j = 0; j < 8; ++j)
14211           pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
14212       }
14213       SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
14214       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14215       In = DAG.getBitcast(MVT::v4i64, In);
14216
            // Bring 64-bit elements 0 and 2 together in the low half and extract
            // that half as the result.
14217       static const int ShufMask[] = {0,  2,  -1,  -1};
14218       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14219                                 ShufMask);
14220       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14221                        DAG.getIntPtrConstant(0, DL));
14222       return DAG.getBitcast(VT, In);
14223     }
14224
          // No hasInt256(): shuffle each v4i32 half as bytes, then join the low
          // halves of the two shuffled results.
14225     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14226                                DAG.getIntPtrConstant(0, DL));
14227
14228     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14229                                DAG.getIntPtrConstant(4, DL));
14230
14231     OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
14232     OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
14233
14234     // The PSHUFB mask:
14235     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14236                                    -1, -1, -1, -1, -1, -1, -1, -1};
14237
14238     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14239     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14240     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14241
14242     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14243     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14244
14245     // The MOVLHPS Mask:
14246     static const int ShufMask2[] = {0, 1, 4, 5};
14247     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14248     return DAG.getBitcast(MVT::v8i16, res);
14249   }
14250
14251   // Handle truncation of V256 to V128 using shuffles.
14252   if (!VT.is128BitVector() || !InVT.is256BitVector())
14253     return SDValue();
14254
14255   assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
14256
        // View the input as a vector with twice as many elements of the result's
        // element type, move the even-indexed elements to the front, and extract
        // the leading VT-sized subvector.
14257   unsigned NumElems = VT.getVectorNumElements();
14258   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14259
14260   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14261   // Prepare truncation shuffle mask
14262   for (unsigned i = 0; i != NumElems; ++i)
14263     MaskVec[i] = i * 2;
14264   SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
14265                                    DAG.getUNDEF(NVT), MaskVec);
14266   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14267                      DAG.getIntPtrConstant(0, DL));
14268 }
14269
14270 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14271                                            SelectionDAG &DAG) const {
        // Lowers a scalar FP_TO_SINT through FP_TO_INTHelper (IsSigned = true).
        // Returns Op unchanged when the helper yields nothing, a load from the
        // helper's stack slot when one is returned, or the helper's value itself.
14272   assert(!Op.getSimpleValueType().isVector());
14273
14274   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14275     /*IsSigned=*/ true, /*IsReplace=*/ false);
14276   SDValue FIST = Vals.first, StackSlot = Vals.second;
14277   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14278   if (!FIST.getNode())
14279     return Op;
14280
14281   if (StackSlot.getNode())
14282     // Load the result.
          // FIST is used as the chain of the load.
14283     return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
14284                        MachinePointerInfo());
14285
14286   // The node is the result.
14287   return FIST;
14288 }
14289
14290 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14291                                            SelectionDAG &DAG) const {
        // Lowers FP_TO_UINT through FP_TO_INTHelper (IsSigned = false). Returns
        // Op unchanged when the helper yields nothing, a load from the helper's
        // stack slot when one is returned, or the helper's value itself.
14292   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14293     /*IsSigned=*/ false, /*IsReplace=*/ false);
14294   SDValue FIST = Vals.first, StackSlot = Vals.second;
14295   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14296   if (!FIST.getNode())
14297     return Op;
14298
14299   if (StackSlot.getNode())
14300     // Load the result.
          // The load uses Op's value type, so when the helper widened the FIST to
          // i64 for an i32 result only part of the slot is read back.
14301     return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
14302                        MachinePointerInfo());
14303
14304   // The node is the result.
14305   return FIST;
14306 }
14307
14308 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
        // Lowers an FP_EXTEND whose source is v2f32: the source is widened to
        // v4f32 by concatenating it with an undef v2f32, and the result is
        // produced by an X86ISD::VFPEXT node of Op's type.
14309   SDLoc DL(Op);
14310   MVT VT = Op.getSimpleValueType();
14311   SDValue In = Op.getOperand(0);
14312   MVT SVT = In.getSimpleValueType();
14313
14314   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14315
14316   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14317                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14318                                  In, DAG.getUNDEF(SVT)));
14319 }
14320
14321 /// The only differences between FABS and FNEG are the mask and the logic op.
14322 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
      /// Both are lowered to a bitwise FP logic node (FAND, FXOR or FOR) against a
      /// mask loaded from the constant pool.
14323 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14324   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14325          "Wrong opcode for lowering FABS or FNEG.");
14326
14327   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14328
14329   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14330   // into an FNABS. We'll lower the FABS after that if it is still in use.
14331   if (IsFABS)
14332     for (SDNode *User : Op->uses())
14333       if (User->getOpcode() == ISD::FNEG)
14334         return Op;
14335
14336   SDLoc dl(Op);
14337   MVT VT = Op.getSimpleValueType();
14338
14339   bool IsF128 = (VT == MVT::f128);
14340
14341   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14342   // decide if we should generate a 16-byte constant mask when we only need 4 or
14343   // 8 bytes for the scalar case.
14344
        // LogicVT is the type the logic op is performed in; EltVT and NumElts
        // describe the splat mask constant built below.
14345   MVT LogicVT;
14346   MVT EltVT;
14347   unsigned NumElts;
14348
14349   if (VT.isVector()) {
14350     LogicVT = VT;
14351     EltVT = VT.getVectorElementType();
14352     NumElts = VT.getVectorNumElements();
14353   } else if (IsF128) {
14354     // SSE instructions are used for optimized f128 logical operations.
14355     LogicVT = MVT::f128;
14356     EltVT = VT;
14357     NumElts = 1;
14358   } else {
14359     // There are no scalar bitwise logical SSE/AVX instructions, so we
14360     // generate a 16-byte vector constant and logic op even for the scalar case.
14361     // Using a 16-byte mask allows folding the load of the mask with
14362     // the logic op, so it can save (~4 bytes) on code size.
14363     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
14364     EltVT = VT;
14365     NumElts = (VT == MVT::f64) ? 2 : 4;
14366   }
14367
14368   unsigned EltBits = EltVT.getSizeInBits();
14369   LLVMContext *Context = DAG.getContext();
14370   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14371   APInt MaskElt =
14372     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14373   Constant *C = ConstantInt::get(*Context, MaskElt);
14374   C = ConstantVector::getSplat(NumElts, C);
14375   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14376   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
14377   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14378   SDValue Mask = DAG.getLoad(
14379       LogicVT, dl, DAG.getEntryNode(), CPIdx,
14380       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
14381
        // FABS -> FAND with the 0x7f... mask; FNEG -> FXOR with the sign-bit
        // mask; FNEG(FABS(x)) -> FOR of x with the sign-bit mask, which bypasses
        // the inner FABS node.
14382   SDValue Op0 = Op.getOperand(0);
14383   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14384   unsigned LogicOp =
14385     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14386   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14387
14388   if (VT.isVector() || IsF128)
14389     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14390
14391   // For the scalar case extend to a 128-bit vector, perform the logic op,
14392   // and extract the scalar result back out.
14393   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
14394   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14395   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
14396                      DAG.getIntPtrConstant(0, dl));
14397 }
14398
14399 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
        // Lowers FCOPYSIGN(Op0 = magnitude, Op1 = sign) for f32, f64 and f128 as
        //   (Op0 & ~signbit) | (Op1 & signbit)
        // using FAND/FOR nodes with masks loaded from the constant pool.
14400   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14401   LLVMContext *Context = DAG.getContext();
14402   SDValue Op0 = Op.getOperand(0);
14403   SDValue Op1 = Op.getOperand(1);
14404   SDLoc dl(Op);
14405   MVT VT = Op.getSimpleValueType();
14406   MVT SrcVT = Op1.getSimpleValueType();
14407   bool IsF128 = (VT == MVT::f128);
14408
14409   // If second operand is smaller, extend it first.
14410   if (SrcVT.bitsLT(VT)) {
14411     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14412     SrcVT = VT;
14413   }
14414   // And if it is bigger, shrink it first.
14415   if (SrcVT.bitsGT(VT)) {
14416     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
14417     SrcVT = VT;
14418   }
14419
14420   // At this point the operands and the result should have the same
14421   // type, and that won't be f80 since that is not custom lowered.
14422   assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
14423          "Unexpected type in LowerFCOPYSIGN");
14424
14425   const fltSemantics &Sem =
14426       VT == MVT::f64 ? APFloat::IEEEdouble :
14427           (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
14428   const unsigned SizeInBits = VT.getSizeInBits();
14429
        // CV holds the elements of the constant-pool mask: 2 for f64, 1 for
        // f128, 4 for f32. All start as zero; only element 0 is overwritten.
14430   SmallVector<Constant *, 4> CV(
14431       VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
14432       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14433
14434   // First, clear all bits but the sign bit from the second operand (sign).
14435   CV[0] = ConstantFP::get(*Context,
14436                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14437   Constant *C = ConstantVector::get(CV);
14438   auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
14439   SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14440
14441   // Perform all logic operations as 16-byte vectors because there are no
14442   // scalar FP logic instructions in SSE. This allows load folding of the
14443   // constants into the logic instructions.
14444   MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
14445   SDValue Mask1 =
14446       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14447                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14448                   /* Alignment = */ 16);
14449   if (!IsF128)
14450     Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
14451   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
14452
14453   // Next, clear the sign bit from the first operand (magnitude).
14454   // If it's a constant, we can clear it here.
14455   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14456     APFloat APF = Op0CN->getValueAPF();
14457     // If the magnitude is a positive zero, the sign bit alone is enough.
14458     if (APF.isPosZero())
14459       return IsF128 ? SignBit :
14460           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
14461                       DAG.getIntPtrConstant(0, dl));
14462     APF.clearSign();
14463     CV[0] = ConstantFP::get(*Context, APF);
14464   } else {
          // Non-constant magnitude: element 0 becomes the all-but-sign-bit mask.
14465     CV[0] = ConstantFP::get(
14466         *Context,
14467         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14468   }
        // Val is either the sign-cleared constant magnitude itself or the mask
        // that will be ANDed with the magnitude below.
14469   C = ConstantVector::get(CV);
14470   CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14471   SDValue Val =
14472       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14473                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14474                   /* Alignment = */ 16);
14475   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14476   if (!isa<ConstantFPSDNode>(Op0)) {
14477     if (!IsF128)
14478       Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
14479     Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
14480   }
14481   // OR the magnitude value with the sign bit.
14482   Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
        // SrcVT equals VT by this point (see the extend/round steps above).
14483   return IsF128 ? Val :
14484       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
14485                   DAG.getIntPtrConstant(0, dl));
14486 }
14487
14488 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14489   SDValue N0 = Op.getOperand(0);
14490   SDLoc dl(Op);
14491   MVT VT = Op.getSimpleValueType();
14492
14493   MVT OpVT = N0.getSimpleValueType();
14494   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
14495          "Unexpected type for FGETSIGN");
14496
14497   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
14498   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
14499   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
14500   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
14501   Res = DAG.getZExtOrTrunc(Res, dl, VT);
14502   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
14503   return Res;
14504 }
14505
14506 // Check whether an OR'd tree is PTEST-able.
14507 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
14508                                       SelectionDAG &DAG) {
14509   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14510
14511   if (!Subtarget.hasSSE41())
14512     return SDValue();
14513
14514   if (!Op->hasOneUse())
14515     return SDValue();
14516
14517   SDNode *N = Op.getNode();
14518   SDLoc DL(N);
14519
14520   SmallVector<SDValue, 8> Opnds;
14521   DenseMap<SDValue, unsigned> VecInMap;
14522   SmallVector<SDValue, 8> VecIns;
14523   EVT VT = MVT::Other;
14524
14525   // Recognize a special case where a vector is casted into wide integer to
14526   // test all 0s.
14527   Opnds.push_back(N->getOperand(0));
14528   Opnds.push_back(N->getOperand(1));
14529
14530   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14531     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14532     // BFS traverse all OR'd operands.
14533     if (I->getOpcode() == ISD::OR) {
14534       Opnds.push_back(I->getOperand(0));
14535       Opnds.push_back(I->getOperand(1));
14536       // Re-evaluate the number of nodes to be traversed.
14537       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14538       continue;
14539     }
14540
14541     // Quit if a non-EXTRACT_VECTOR_ELT
14542     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14543       return SDValue();
14544
14545     // Quit if without a constant index.
14546     SDValue Idx = I->getOperand(1);
14547     if (!isa<ConstantSDNode>(Idx))
14548       return SDValue();
14549
14550     SDValue ExtractedFromVec = I->getOperand(0);
14551     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14552     if (M == VecInMap.end()) {
14553       VT = ExtractedFromVec.getValueType();
14554       // Quit if not 128/256-bit vector.
14555       if (!VT.is128BitVector() && !VT.is256BitVector())
14556         return SDValue();
14557       // Quit if not the same type.
14558       if (VecInMap.begin() != VecInMap.end() &&
14559           VT != VecInMap.begin()->first.getValueType())
14560         return SDValue();
14561       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14562       VecIns.push_back(ExtractedFromVec);
14563     }
14564     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14565   }
14566
14567   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14568          "Not extracted from 128-/256-bit vector.");
14569
14570   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14571
14572   for (DenseMap<SDValue, unsigned>::const_iterator
14573         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14574     // Quit if not all elements are used.
14575     if (I->second != FullMask)
14576       return SDValue();
14577   }
14578
14579   MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14580
14581   // Cast all vectors into TestVT for PTEST.
14582   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14583     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
14584
14585   // If more than one full vectors are evaluated, OR them first before PTEST.
14586   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14587     // Each iteration will OR 2 nodes and append the result until there is only
14588     // 1 node left, i.e. the final OR'd value of all vectors.
14589     SDValue LHS = VecIns[Slot];
14590     SDValue RHS = VecIns[Slot + 1];
14591     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14592   }
14593
14594   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14595                      VecIns.back(), VecIns.back());
14596 }
14597
14598 /// \brief return true if \c Op has a use that doesn't just read flags.
14599 static bool hasNonFlagsUse(SDValue Op) {
14600   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14601        ++UI) {
14602     SDNode *User = *UI;
14603     unsigned UOpNo = UI.getOperandNo();
14604     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14605       // Look pass truncate.
14606       UOpNo = User->use_begin().getOperandNo();
14607       User = *User->use_begin();
14608     }
14609
14610     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14611         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14612       return true;
14613   }
14614   return false;
14615 }
14616
14617 // Emit KTEST instruction for bit vectors on AVX-512
14618 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
14619                          const X86Subtarget &Subtarget) {
14620   if (Op.getOpcode() == ISD::BITCAST) {
14621     auto hasKTEST = [&](MVT VT) {
14622       unsigned SizeInBits = VT.getSizeInBits();
14623       return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
14624         (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
14625     };
14626     SDValue Op0 = Op.getOperand(0);
14627     MVT Op0VT = Op0.getValueType().getSimpleVT();
14628     if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
14629         hasKTEST(Op0VT))
14630       return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
14631   }
14632   return SDValue();
14633 }
14634
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
///
/// \param Op    the value being compared against zero.
/// \param X86CC the X86 condition code the caller will evaluate on the
///              result; it determines whether CF and/or OF are needed.
/// \param dl    debug location for newly created nodes.
/// \param DAG   the selection DAG in which nodes are created.
/// \returns an MVT::i32 value: either a fresh X86ISD::CMP against zero, a
///          KTEST / PTEST node, or result #1 of an X86ISD arithmetic node
///          that replaces (or already is) \p Op.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                                    SelectionDAG &DAG) const {
  // i1 values are zero-extended to i8 and compared against an i8 zero.
  if (Op.getValueType() == MVT::i1) {
    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
                       DAG.getConstant(0, dl, MVT::i8));
  }
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL: {
      const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
      if (BinNode->Flags.hasNoSignedWrap())
        break;
      // No break here: without the no-signed-wrap flag control falls through
      // into the default case below and NeedOF is set.
    }
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit KTEST for bit vectors
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  // Opcode stays 0 unless one of the cases below selects a flag-setting
  // X86ISD opcode; NumOperands is how many of Op's operands that node takes.
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR: {
          NeedTruncation = true;
          ArithOp = Arith;
        }
      }
  }

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST.  We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output.  Alas, even if none our users are stores, that
    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    // NOTE(review): the loop below also lets ISD::STORE users through, which
    // the comment above does not mention — confirm this is intended.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->isOne() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::SHL:
  case ISD::SRL:
    // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      // The mask selects exactly the source bits that survive the shift:
      // the high bits for SRL, the low bits for SHL.
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
        break;
      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                       DAG.getConstant(Mask, dl, VT));
    }
    // Opcode is left at 0 here, so the generic CMP-with-zero path is used.
    break;

  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    if (!hasNonFlagsUse(Op)) {
      SDValue Op0 = ArithOp->getOperand(0);
      SDValue Op1 = ArithOp->getOperand(1);
      EVT VT = ArithOp.getValueType();
      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;

      // But if we can combine this into an ANDN operation, then create an AND
      // now and allow it to be pattern matched into an ANDN.
      if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
        break;
    }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      // For an equality test on an untruncated OR, first try the
      // vector-all-zero (PTEST) lowering.
      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // Already a two-result X86 node: hand back its second result directly.
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
      default: break;
      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
      case ISD::AND: ConvertedOp = X86ISD::AND; break;
      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        // Truncate both wide operands to the narrow type and redo the
        // operation at the narrow width.
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  // No flag-setting arithmetic opcode was chosen: fall back to KTEST or to
  // an explicit compare against zero.
  if (Opcode == 0) {
    // Emit KTEST for bit vectors
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;

    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  // Build the two-result node (value, MVT::i32) from the first NumOperands
  // operands of Op, redirect all uses of Op to it, and return its second
  // result.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}
14879
14880 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
14881 /// equivalent.
14882 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
14883                                    const SDLoc &dl, SelectionDAG &DAG) const {
14884   if (isNullConstant(Op1))
14885     return EmitTest(Op0, X86CC, dl, DAG);
14886
14887   assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
14888          "Unexpected comparison operation for MVT::i1 operands");
14889
14890   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
14891        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
14892     // Only promote the compare up to I32 if it is a 16 bit operation
14893     // with an immediate.  16 bit immediates are to be avoided.
14894     if ((Op0.getValueType() == MVT::i16 &&
14895          (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
14896         !DAG.getMachineFunction().getFunction()->optForMinSize() &&
14897         !Subtarget.isAtom()) {
14898       unsigned ExtendOp =
14899           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
14900       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
14901       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
14902     }
14903     // Use SUB instead of CMP to enable CSE between SUB and CMP.
14904     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
14905     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
14906                               Op0, Op1);
14907     return SDValue(Sub.getNode(), 1);
14908   }
14909   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
14910 }
14911
14912 /// Convert a comparison if required by the subtarget.
14913 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
14914                                                  SelectionDAG &DAG) const {
14915   // If the subtarget does not support the FUCOMI instruction, floating-point
14916   // comparisons have to be converted.
14917   if (Subtarget.hasCMov() ||
14918       Cmp.getOpcode() != X86ISD::CMP ||
14919       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
14920       !Cmp.getOperand(1).getValueType().isFloatingPoint())
14921     return Cmp;
14922
14923   // The instruction selector will select an FUCOM instruction instead of
14924   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
14925   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
14926   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
14927   SDLoc dl(Cmp);
14928   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
14929   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
14930   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
14931                             DAG.getConstant(8, dl, MVT::i8));
14932   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
14933
14934   // Some 64-bit targets lack SAHF support, but they do support FCOMI.
14935   assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
14936   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
14937 }
14938
14939 /// The minimum architected relative accuracy is 2^-12. We need one
14940 /// Newton-Raphson step to have a good float result (24 bits of precision).
14941 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
14942                                             DAGCombinerInfo &DCI,
14943                                             unsigned &RefinementSteps,
14944                                             bool &UseOneConstNR) const {
14945   EVT VT = Op.getValueType();
14946   const char *RecipOp;
14947
14948   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
14949   // TODO: Add support for AVX512 (v16f32).
14950   // It is likely not profitable to do this for f64 because a double-precision
14951   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
14952   // instructions: convert to single, rsqrtss, convert back to double, refine
14953   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
14954   // along with FMA, this could be a throughput win.
14955   if (VT == MVT::f32 && Subtarget.hasSSE1())
14956     RecipOp = "sqrtf";
14957   else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14958            (VT == MVT::v8f32 && Subtarget.hasAVX()))
14959     RecipOp = "vec-sqrtf";
14960   else
14961     return SDValue();
14962
14963   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14964   if (!Recips.isEnabled(RecipOp))
14965     return SDValue();
14966
14967   RefinementSteps = Recips.getRefinementSteps(RecipOp);
14968   UseOneConstNR = false;
14969   return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
14970 }
14971
14972 /// The minimum architected relative accuracy is 2^-12. We need one
14973 /// Newton-Raphson step to have a good float result (24 bits of precision).
14974 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
14975                                             DAGCombinerInfo &DCI,
14976                                             unsigned &RefinementSteps) const {
14977   EVT VT = Op.getValueType();
14978   const char *RecipOp;
14979
14980   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
14981   // TODO: Add support for AVX512 (v16f32).
14982   // It is likely not profitable to do this for f64 because a double-precision
14983   // reciprocal estimate with refinement on x86 prior to FMA requires
14984   // 15 instructions: convert to single, rcpss, convert back to double, refine
14985   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
14986   // along with FMA, this could be a throughput win.
14987   if (VT == MVT::f32 && Subtarget.hasSSE1())
14988     RecipOp = "divf";
14989   else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14990            (VT == MVT::v8f32 && Subtarget.hasAVX()))
14991     RecipOp = "vec-divf";
14992   else
14993     return SDValue();
14994
14995   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14996   if (!Recips.isEnabled(RecipOp))
14997     return SDValue();
14998
14999   RefinementSteps = Recips.getRefinementSteps(RecipOp);
15000   return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15001 }
15002
15003 /// If we have at least two divisions that use the same divisor, convert to
15004 /// multplication by a reciprocal. This may need to be adjusted for a given
15005 /// CPU if a division's cost is not at least twice the cost of a multiplication.
15006 /// This is because we still need one division to calculate the reciprocal and
15007 /// then we need two multiplies by that reciprocal as replacements for the
15008 /// original divisions.
15009 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
15010   return 2;
15011 }
15012
15013 /// Result of 'and' is compared against zero. Change to a BT node if possible.
15014 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15015                                      const SDLoc &dl, SelectionDAG &DAG) const {
15016   SDValue Op0 = And.getOperand(0);
15017   SDValue Op1 = And.getOperand(1);
15018   if (Op0.getOpcode() == ISD::TRUNCATE)
15019     Op0 = Op0.getOperand(0);
15020   if (Op1.getOpcode() == ISD::TRUNCATE)
15021     Op1 = Op1.getOperand(0);
15022
15023   SDValue LHS, RHS;
15024   if (Op1.getOpcode() == ISD::SHL)
15025     std::swap(Op0, Op1);
15026   if (Op0.getOpcode() == ISD::SHL) {
15027     if (isOneConstant(Op0.getOperand(0))) {
15028       // If we looked past a truncate, check that it's only truncating away
15029       // known zeros.
15030       unsigned BitWidth = Op0.getValueSizeInBits();
15031       unsigned AndBitWidth = And.getValueSizeInBits();
15032       if (BitWidth > AndBitWidth) {
15033         APInt Zeros, Ones;
15034         DAG.computeKnownBits(Op0, Zeros, Ones);
15035         if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15036           return SDValue();
15037       }
15038       LHS = Op1;
15039       RHS = Op0.getOperand(1);
15040     }
15041   } else if (Op1.getOpcode() == ISD::Constant) {
15042     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15043     uint64_t AndRHSVal = AndRHS->getZExtValue();
15044     SDValue AndLHS = Op0;
15045
15046     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15047       LHS = AndLHS.getOperand(0);
15048       RHS = AndLHS.getOperand(1);
15049     }
15050
15051     // Use BT if the immediate can't be encoded in a TEST instruction.
15052     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15053       LHS = AndLHS;
15054       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
15055     }
15056   }
15057
15058   if (LHS.getNode()) {
15059     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15060     // instruction.  Since the shift amount is in-range-or-undefined, we know
15061     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15062     // the encoding for the i16 version is larger than the i32 version.
15063     // Also promote i16 to i32 for performance / code size reason.
15064     if (LHS.getValueType() == MVT::i8 ||
15065         LHS.getValueType() == MVT::i16)
15066       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15067
15068     // If the operand types disagree, extend the shift amount to match.  Since
15069     // BT ignores high bits (like shifts) we can use anyextend.
15070     if (LHS.getValueType() != RHS.getValueType())
15071       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15072
15073     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15074     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15075     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15076                        DAG.getConstant(Cond, dl, MVT::i8), BT);
15077   }
15078
15079   return SDValue();
15080 }
15081
15082 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
15083 /// CMPs.
15084 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15085                               SDValue &Op1) {
15086   unsigned SSECC;
15087   bool Swap = false;
15088
15089   // SSE Condition code mapping:
15090   //  0 - EQ
15091   //  1 - LT
15092   //  2 - LE
15093   //  3 - UNORD
15094   //  4 - NEQ
15095   //  5 - NLT
15096   //  6 - NLE
15097   //  7 - ORD
15098   switch (SetCCOpcode) {
15099   default: llvm_unreachable("Unexpected SETCC condition");
15100   case ISD::SETOEQ:
15101   case ISD::SETEQ:  SSECC = 0; break;
15102   case ISD::SETOGT:
15103   case ISD::SETGT:  Swap = true; // Fallthrough
15104   case ISD::SETLT:
15105   case ISD::SETOLT: SSECC = 1; break;
15106   case ISD::SETOGE:
15107   case ISD::SETGE:  Swap = true; // Fallthrough
15108   case ISD::SETLE:
15109   case ISD::SETOLE: SSECC = 2; break;
15110   case ISD::SETUO:  SSECC = 3; break;
15111   case ISD::SETUNE:
15112   case ISD::SETNE:  SSECC = 4; break;
15113   case ISD::SETULE: Swap = true; // Fallthrough
15114   case ISD::SETUGE: SSECC = 5; break;
15115   case ISD::SETULT: Swap = true; // Fallthrough
15116   case ISD::SETUGT: SSECC = 6; break;
15117   case ISD::SETO:   SSECC = 7; break;
15118   case ISD::SETUEQ:
15119   case ISD::SETONE: SSECC = 8; break;
15120   }
15121   if (Swap)
15122     std::swap(Op0, Op1);
15123
15124   return SSECC;
15125 }
15126
15127 /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
15128 /// concatenate the result back.
15129 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15130   MVT VT = Op.getSimpleValueType();
15131
15132   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15133          "Unsupported value type for operation");
15134
15135   unsigned NumElems = VT.getVectorNumElements();
15136   SDLoc dl(Op);
15137   SDValue CC = Op.getOperand(2);
15138
15139   // Extract the LHS vectors
15140   SDValue LHS = Op.getOperand(0);
15141   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
15142   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
15143
15144   // Extract the RHS vectors
15145   SDValue RHS = Op.getOperand(1);
15146   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
15147   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
15148
15149   // Issue the operation on the smaller types and concatenate the result back
15150   MVT EltVT = VT.getVectorElementType();
15151   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15152   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15153                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15154                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15155 }
15156
15157 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15158   SDValue Op0 = Op.getOperand(0);
15159   SDValue Op1 = Op.getOperand(1);
15160   SDValue CC = Op.getOperand(2);
15161   MVT VT = Op.getSimpleValueType();
15162   SDLoc dl(Op);
15163
15164   assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15165          "Unexpected type for boolean compare operation");
15166   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15167   SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
15168                                DAG.getConstant(-1, dl, VT));
15169   SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
15170                                DAG.getConstant(-1, dl, VT));
15171   switch (SetCCOpcode) {
15172   default: llvm_unreachable("Unexpected SETCC condition");
15173   case ISD::SETEQ:
15174     // (x == y) -> ~(x ^ y)
15175     return DAG.getNode(ISD::XOR, dl, VT,
15176                        DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
15177                        DAG.getConstant(-1, dl, VT));
15178   case ISD::SETNE:
15179     // (x != y) -> (x ^ y)
15180     return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
15181   case ISD::SETUGT:
15182   case ISD::SETGT:
15183     // (x > y) -> (x & ~y)
15184     return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
15185   case ISD::SETULT:
15186   case ISD::SETLT:
15187     // (x < y) -> (~x & y)
15188     return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
15189   case ISD::SETULE:
15190   case ISD::SETLE:
15191     // (x <= y) -> (~x | y)
15192     return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
15193   case ISD::SETUGE:
15194   case ISD::SETGE:
15195     // (x >=y) -> (x | ~y)
15196     return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
15197   }
15198 }
15199
15200 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15201
15202   SDValue Op0 = Op.getOperand(0);
15203   SDValue Op1 = Op.getOperand(1);
15204   SDValue CC = Op.getOperand(2);
15205   MVT VT = Op.getSimpleValueType();
15206   SDLoc dl(Op);
15207
15208   assert(VT.getVectorElementType() == MVT::i1 &&
15209          "Cannot set masked compare for this operation");
15210
15211   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15212   unsigned  Opc = 0;
15213   bool Unsigned = false;
15214   bool Swap = false;
15215   unsigned SSECC;
15216   switch (SetCCOpcode) {
15217   default: llvm_unreachable("Unexpected SETCC condition");
15218   case ISD::SETNE:  SSECC = 4; break;
15219   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15220   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15221   case ISD::SETLT:  Swap = true; //fall-through
15222   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15223   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15224   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15225   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15226   case ISD::SETULE: Unsigned = true; //fall-through
15227   case ISD::SETLE:  SSECC = 2; break;
15228   }
15229
15230   if (Swap)
15231     std::swap(Op0, Op1);
15232   if (Opc)
15233     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15234   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15235   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15236                      DAG.getConstant(SSECC, dl, MVT::i8));
15237 }
15238
15239 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15240 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15241 /// return an empty value.
15242 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
15243                                       SelectionDAG &DAG) {
15244   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15245   if (!BV)
15246     return SDValue();
15247
15248   MVT VT = Op1.getSimpleValueType();
15249   MVT EVT = VT.getVectorElementType();
15250   unsigned n = VT.getVectorNumElements();
15251   SmallVector<SDValue, 8> ULTOp1;
15252
15253   for (unsigned i = 0; i < n; ++i) {
15254     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15255     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
15256       return SDValue();
15257
15258     // Avoid underflow.
15259     APInt Val = Elt->getAPIntValue();
15260     if (Val == 0)
15261       return SDValue();
15262
15263     ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
15264   }
15265
15266   return DAG.getBuildVector(VT, dl, ULTOp1);
15267 }
15268
15269 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
15270                            SelectionDAG &DAG) {
15271   SDValue Op0 = Op.getOperand(0);
15272   SDValue Op1 = Op.getOperand(1);
15273   SDValue CC = Op.getOperand(2);
15274   MVT VT = Op.getSimpleValueType();
15275   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15276   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15277   SDLoc dl(Op);
15278
15279   if (isFP) {
15280 #ifndef NDEBUG
15281     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15282     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15283 #endif
15284
15285     unsigned Opc;
15286     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15287       assert(VT.getVectorNumElements() <= 16);
15288       Opc = X86ISD::CMPM;
15289     } else {
15290       Opc = X86ISD::CMPP;
15291       // The SSE/AVX packed FP comparison nodes are defined with a
15292       // floating-point vector result that matches the operand type. This allows
15293       // them to work with an SSE1 target (integer vector types are not legal).
15294       VT = Op0.getSimpleValueType();
15295     }
15296
15297     // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
15298     // emit two comparisons and a logic op to tie them together.
15299     // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
15300     // available.
15301     SDValue Cmp;
15302     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15303     if (SSECC == 8) {
15304       // LLVM predicate is SETUEQ or SETONE.
15305       unsigned CC0, CC1;
15306       unsigned CombineOpc;
15307       if (SetCCOpcode == ISD::SETUEQ) {
15308         CC0 = 3; // UNORD
15309         CC1 = 0; // EQ
15310         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
15311                                            static_cast<unsigned>(ISD::OR);
15312       } else {
15313         assert(SetCCOpcode == ISD::SETONE);
15314         CC0 = 7; // ORD
15315         CC1 = 4; // NEQ
15316         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
15317                                            static_cast<unsigned>(ISD::AND);
15318       }
15319
15320       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15321                                  DAG.getConstant(CC0, dl, MVT::i8));
15322       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15323                                  DAG.getConstant(CC1, dl, MVT::i8));
15324       Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15325     } else {
15326       // Handle all other FP comparisons here.
15327       Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
15328                         DAG.getConstant(SSECC, dl, MVT::i8));
15329     }
15330
15331     // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
15332     // result type of SETCC. The bitcast is expected to be optimized away
15333     // during combining/isel.
15334     if (Opc == X86ISD::CMPP)
15335       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
15336
15337     return Cmp;
15338   }
15339
15340   MVT VTOp0 = Op0.getSimpleValueType();
15341   assert(VTOp0 == Op1.getSimpleValueType() &&
15342          "Expected operands with same type!");
15343   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
15344          "Invalid number of packed elements for source and destination!");
15345
15346   if (VT.is128BitVector() && VTOp0.is256BitVector()) {
15347     // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
15348     // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
15349     // legalizer firstly checks if the first operand in input to the setcc has
15350     // a legal type. If so, then it promotes the return type to that same type.
15351     // Otherwise, the return type is promoted to the 'next legal type' which,
15352     // for a vector of MVT::i1 is always a 128-bit integer vector type.
15353     //
15354     // We reach this code only if the following two conditions are met:
15355     // 1. Both return type and operand type have been promoted to wider types
15356     //    by the type legalizer.
15357     // 2. The original operand type has been promoted to a 256-bit vector.
15358     //
15359     // Note that condition 2. only applies for AVX targets.
15360     SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
15361     return DAG.getZExtOrTrunc(NewOp, dl, VT);
15362   }
15363
15364   // The non-AVX512 code below works under the assumption that source and
15365   // destination types are the same.
15366   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
15367          "Value types for source and destination must be the same!");
15368
15369   // Break 256-bit integer vector compare into smaller ones.
15370   if (VT.is256BitVector() && !Subtarget.hasInt256())
15371     return Lower256IntVSETCC(Op, DAG);
15372
15373   // Operands are boolean (vectors of i1)
15374   MVT OpVT = Op1.getSimpleValueType();
15375   if (OpVT.getVectorElementType() == MVT::i1)
15376     return LowerBoolVSETCC_AVX512(Op, DAG);
15377
15378   // The result is boolean, but operands are int/float
15379   if (VT.getVectorElementType() == MVT::i1) {
15380     // In AVX-512 architecture setcc returns mask with i1 elements,
15381     // But there is no compare instruction for i8 and i16 elements in KNL.
15382     // In this case use SSE compare
15383     bool UseAVX512Inst =
15384       (OpVT.is512BitVector() ||
15385        OpVT.getVectorElementType().getSizeInBits() >= 32 ||
15386        (Subtarget.hasBWI() && Subtarget.hasVLX()));
15387
15388     if (UseAVX512Inst)
15389       return LowerIntVSETCC_AVX512(Op, DAG);
15390
15391     return DAG.getNode(ISD::TRUNCATE, dl, VT,
15392                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15393   }
15394
15395   // Lower using XOP integer comparisons.
15396   if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
15397        VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
15398     // Translate compare code to XOP PCOM compare mode.
15399     unsigned CmpMode = 0;
15400     switch (SetCCOpcode) {
15401     default: llvm_unreachable("Unexpected SETCC condition");
15402     case ISD::SETULT:
15403     case ISD::SETLT: CmpMode = 0x00; break;
15404     case ISD::SETULE:
15405     case ISD::SETLE: CmpMode = 0x01; break;
15406     case ISD::SETUGT:
15407     case ISD::SETGT: CmpMode = 0x02; break;
15408     case ISD::SETUGE:
15409     case ISD::SETGE: CmpMode = 0x03; break;
15410     case ISD::SETEQ: CmpMode = 0x04; break;
15411     case ISD::SETNE: CmpMode = 0x05; break;
15412     }
15413
15414     // Are we comparing unsigned or signed integers?
15415     unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
15416       ? X86ISD::VPCOMU : X86ISD::VPCOM;
15417
15418     return DAG.getNode(Opc, dl, VT, Op0, Op1,
15419                        DAG.getConstant(CmpMode, dl, MVT::i8));
15420   }
15421
15422   // We are handling one of the integer comparisons here.  Since SSE only has
15423   // GT and EQ comparisons for integer, swapping operands and multiple
15424   // operations may be required for some comparisons.
15425   unsigned Opc;
15426   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15427   bool Subus = false;
15428
15429   switch (SetCCOpcode) {
15430   default: llvm_unreachable("Unexpected SETCC condition");
15431   case ISD::SETNE:  Invert = true;
15432   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15433   case ISD::SETLT:  Swap = true;
15434   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15435   case ISD::SETGE:  Swap = true;
15436   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15437                     Invert = true; break;
15438   case ISD::SETULT: Swap = true;
15439   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15440                     FlipSigns = true; break;
15441   case ISD::SETUGE: Swap = true;
15442   case ISD::SETULE: Opc = X86ISD::PCMPGT;
15443                     FlipSigns = true; Invert = true; break;
15444   }
15445
15446   // Special case: Use min/max operations for SETULE/SETUGE
15447   MVT VET = VT.getVectorElementType();
15448   bool hasMinMax =
15449        (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15450     || (Subtarget.hasSSE2()  && (VET == MVT::i8));
15451
15452   if (hasMinMax) {
15453     switch (SetCCOpcode) {
15454     default: break;
15455     case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
15456     case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
15457     }
15458
15459     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15460   }
15461
15462   bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15463   if (!MinMax && hasSubus) {
15464     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15465     // Op0 u<= Op1:
15466     //   t = psubus Op0, Op1
15467     //   pcmpeq t, <0..0>
15468     switch (SetCCOpcode) {
15469     default: break;
15470     case ISD::SETULT: {
15471       // If the comparison is against a constant we can turn this into a
15472       // setule.  With psubus, setule does not require a swap.  This is
15473       // beneficial because the constant in the register is no longer
15474       // destructed as the destination so it can be hoisted out of a loop.
15475       // Only do this pre-AVX since vpcmp* is no longer destructive.
15476       if (Subtarget.hasAVX())
15477         break;
15478       if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
15479         Op1 = ULEOp1;
15480         Subus = true; Invert = false; Swap = false;
15481       }
15482       break;
15483     }
15484     // Psubus is better than flip-sign because it requires no inversion.
15485     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15486     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15487     }
15488
15489     if (Subus) {
15490       Opc = X86ISD::SUBUS;
15491       FlipSigns = false;
15492     }
15493   }
15494
15495   if (Swap)
15496     std::swap(Op0, Op1);
15497
15498   // Check that the operation in question is available (most are plain SSE2,
15499   // but PCMPGTQ and PCMPEQQ have different requirements).
15500   if (VT == MVT::v2i64) {
15501     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
15502       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
15503
15504       // First cast everything to the right type.
15505       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15506       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15507
15508       // Since SSE has no unsigned integer comparisons, we need to flip the sign
15509       // bits of the inputs before performing those operations. The lower
15510       // compare is always unsigned.
15511       SDValue SB;
15512       if (FlipSigns) {
15513         SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
15514       } else {
15515         SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
15516         SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
15517         SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
15518       }
15519       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15520       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15521
15522       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15523       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15524       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15525
15526       // Create masks for only the low parts/high parts of the 64 bit integers.
15527       static const int MaskHi[] = { 1, 1, 3, 3 };
15528       static const int MaskLo[] = { 0, 0, 2, 2 };
15529       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15530       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15531       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15532
15533       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15534       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15535
15536       if (Invert)
15537         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15538
15539       return DAG.getBitcast(VT, Result);
15540     }
15541
15542     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
15543       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15544       // pcmpeqd + pshufd + pand.
15545       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
15546
15547       // First cast everything to the right type.
15548       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15549       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15550
15551       // Do the compare.
15552       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15553
15554       // Make sure the lower and upper halves are both all-ones.
15555       static const int Mask[] = { 1, 0, 3, 2 };
15556       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15557       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15558
15559       if (Invert)
15560         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15561
15562       return DAG.getBitcast(VT, Result);
15563     }
15564   }
15565
15566   // Since SSE has no unsigned integer comparisons, we need to flip the sign
15567   // bits of the inputs before performing those operations.
15568   if (FlipSigns) {
15569     MVT EltVT = VT.getVectorElementType();
15570     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
15571                                  VT);
15572     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15573     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15574   }
15575
15576   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15577
15578   // If the logical-not of the result is required, perform that now.
15579   if (Invert)
15580     Result = DAG.getNOT(dl, Result, VT);
15581
15582   if (MinMax)
15583     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15584
15585   if (Subus)
15586     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15587                          getZeroVector(VT, Subtarget, DAG, dl));
15588
15589   return Result;
15590 }
15591
15592 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15593
15594   MVT VT = Op.getSimpleValueType();
15595
15596   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15597
15598   assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15599          && "SetCC type must be 8-bit or 1-bit integer");
15600   SDValue Op0 = Op.getOperand(0);
15601   SDValue Op1 = Op.getOperand(1);
15602   SDLoc dl(Op);
15603   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15604
15605   // Optimize to BT if possible.
15606   // Lower (X & (1 << N)) == 0 to BT(X, N).
15607   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15608   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15609   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15610       isNullConstant(Op1) &&
15611       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15612     if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
15613       if (VT == MVT::i1) {
15614         NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
15615                                DAG.getValueType(MVT::i1));
15616         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15617       }
15618       return NewSetCC;
15619     }
15620   }
15621
15622   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15623   // these.
15624   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
15625       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15626
15627     // If the input is a setcc, then reuse the input setcc or use a new one with
15628     // the inverted condition.
15629     if (Op0.getOpcode() == X86ISD::SETCC) {
15630       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15631       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
15632       if (!Invert)
15633         return Op0;
15634
15635       CCode = X86::GetOppositeBranchCondition(CCode);
15636       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15637                                   DAG.getConstant(CCode, dl, MVT::i8),
15638                                   Op0.getOperand(1));
15639       if (VT == MVT::i1) {
15640         SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15641                             DAG.getValueType(MVT::i1));
15642         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15643       }
15644       return SetCC;
15645     }
15646   }
15647   if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15648     if (isOneConstant(Op1)) {
15649       ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15650       return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
15651     }
15652     if (!isNullConstant(Op1)) {
15653       SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
15654       return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
15655     }
15656   }
15657
15658   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15659   unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
15660   if (X86CC == X86::COND_INVALID)
15661     return SDValue();
15662
15663   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15664   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15665   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15666                               DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
15667   if (VT == MVT::i1) {
15668     SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15669                         DAG.getValueType(MVT::i1));
15670     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15671   }
15672   return SetCC;
15673 }
15674
15675 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
15676   SDValue LHS = Op.getOperand(0);
15677   SDValue RHS = Op.getOperand(1);
15678   SDValue Carry = Op.getOperand(2);
15679   SDValue Cond = Op.getOperand(3);
15680   SDLoc DL(Op);
15681
15682   assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
15683   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
15684
15685   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
15686   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15687   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
15688   SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15689                               DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
15690   if (Op.getSimpleValueType() == MVT::i1) {
15691     SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
15692                         DAG.getValueType(MVT::i1));
15693     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
15694   }
15695   return SetCC;
15696 }
15697
15698 /// Return true if opcode is a X86 logical comparison.
15699 static bool isX86LogicalCmp(SDValue Op) {
15700   unsigned Opc = Op.getNode()->getOpcode();
15701   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15702       Opc == X86ISD::SAHF)
15703     return true;
15704   if (Op.getResNo() == 1 &&
15705       (Opc == X86ISD::ADD ||
15706        Opc == X86ISD::SUB ||
15707        Opc == X86ISD::ADC ||
15708        Opc == X86ISD::SBB ||
15709        Opc == X86ISD::SMUL ||
15710        Opc == X86ISD::UMUL ||
15711        Opc == X86ISD::INC ||
15712        Opc == X86ISD::DEC ||
15713        Opc == X86ISD::OR ||
15714        Opc == X86ISD::XOR ||
15715        Opc == X86ISD::AND))
15716     return true;
15717
15718   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15719     return true;
15720
15721   return false;
15722 }
15723
15724 /// Returns the "condition" node, that may be wrapped with "truncate".
15725 /// Like this: (i1 (trunc (i8 X86ISD::SETCC))).
15726 static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15727   if (V.getOpcode() != ISD::TRUNCATE)
15728     return V;
15729
15730   SDValue VOp0 = V.getOperand(0);
15731   if (VOp0.getOpcode() == ISD::AssertZext &&
15732       V.getValueSizeInBits() ==
15733       cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
15734     return VOp0.getOperand(0);
15735
15736   unsigned InBits = VOp0.getValueSizeInBits();
15737   unsigned Bits = V.getValueSizeInBits();
15738   if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
15739     return V.getOperand(0);
15740   return V;
15741 }
15742
15743 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15744   bool addTest = true;
15745   SDValue Cond  = Op.getOperand(0);
15746   SDValue Op1 = Op.getOperand(1);
15747   SDValue Op2 = Op.getOperand(2);
15748   SDLoc DL(Op);
15749   MVT VT = Op1.getSimpleValueType();
15750   SDValue CC;
15751
15752   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15753   // are available or VBLENDV if AVX is available.
15754   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
15755   if (Cond.getOpcode() == ISD::SETCC &&
15756       ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15757        (Subtarget.hasSSE1() && VT == MVT::f32)) &&
15758       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
15759     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15760     int SSECC = translateX86FSETCC(
15761         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15762
15763     if (SSECC != 8) {
15764       if (Subtarget.hasAVX512()) {
15765         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15766                                   DAG.getConstant(SSECC, DL, MVT::i8));
15767         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15768       }
15769
15770       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15771                                 DAG.getConstant(SSECC, DL, MVT::i8));
15772
15773       // If we have AVX, we can use a variable vector select (VBLENDV) instead
15774       // of 3 logic instructions for size savings and potentially speed.
15775       // Unfortunately, there is no scalar form of VBLENDV.
15776
15777       // If either operand is a constant, don't try this. We can expect to
15778       // optimize away at least one of the logic instructions later in that
15779       // case, so that sequence would be faster than a variable blend.
15780
15781       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
15782       // uses XMM0 as the selection register. That may need just as many
15783       // instructions as the AND/ANDN/OR sequence due to register moves, so
15784       // don't bother.
15785
15786       if (Subtarget.hasAVX() &&
15787           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
15788
15789         // Convert to vectors, do a VSELECT, and convert back to scalar.
15790         // All of the conversions should be optimized away.
15791
15792         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
15793         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
15794         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
15795         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
15796
15797         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
15798         VCmp = DAG.getBitcast(VCmpVT, VCmp);
15799
15800         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
15801
15802         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
15803                            VSel, DAG.getIntPtrConstant(0, DL));
15804       }
15805       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15806       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15807       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15808     }
15809   }
15810
15811   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
15812     SDValue Op1Scalar;
15813     if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
15814       Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
15815     else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
15816       Op1Scalar = Op1.getOperand(0);
15817     SDValue Op2Scalar;
15818     if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
15819       Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
15820     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
15821       Op2Scalar = Op2.getOperand(0);
15822     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
15823       SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
15824                                       Op1Scalar.getValueType(),
15825                                       Cond, Op1Scalar, Op2Scalar);
15826       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
15827         return DAG.getBitcast(VT, newSelect);
15828       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
15829       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
15830                          DAG.getIntPtrConstant(0, DL));
15831     }
15832   }
15833
15834   if (VT == MVT::v4i1 || VT == MVT::v2i1) {
15835     SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
15836     Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15837                       DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
15838     Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15839                       DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
15840     SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
15841                                     Cond, Op1, Op2);
15842     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
15843   }
15844
15845   if (Cond.getOpcode() == ISD::SETCC) {
15846     if (SDValue NewCond = LowerSETCC(Cond, DAG))
15847       Cond = NewCond;
15848   }
15849
15850   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15851   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15852   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15853   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
15854   if (Cond.getOpcode() == X86ISD::SETCC &&
15855       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15856       isNullConstant(Cond.getOperand(1).getOperand(1))) {
15857     SDValue Cmp = Cond.getOperand(1);
15858
15859     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15860
15861     if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15862         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15863       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
15864
15865       SDValue CmpOp0 = Cmp.getOperand(0);
15866       // Apply further optimizations for special cases
15867       // (select (x != 0), -1, 0) -> neg & sbb
15868       // (select (x == 0), 0, -1) -> neg & sbb
15869       if (isNullConstant(Y) &&
15870             (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
15871           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15872           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15873                                     DAG.getConstant(0, DL,
15874                                                     CmpOp0.getValueType()),
15875                                     CmpOp0);
15876           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15877                                     DAG.getConstant(X86::COND_B, DL, MVT::i8),
15878                                     SDValue(Neg.getNode(), 1));
15879           return Res;
15880         }
15881
15882       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15883                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
15884       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15885
15886       SDValue Res =   // Res = 0 or -1.
15887         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15888                     DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
15889
15890       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
15891         Res = DAG.getNOT(DL, Res, Res.getValueType());
15892
15893       if (!isNullConstant(Op2))
15894         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15895       return Res;
15896     }
15897   }
15898
15899   // Look past (and (setcc_carry (cmp ...)), 1).
15900   if (Cond.getOpcode() == ISD::AND &&
15901       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
15902       isOneConstant(Cond.getOperand(1)))
15903     Cond = Cond.getOperand(0);
15904
15905   // If condition flag is set by a X86ISD::CMP, then use it as the condition
15906   // setting operand in place of the X86ISD::SETCC.
15907   unsigned CondOpcode = Cond.getOpcode();
15908   if (CondOpcode == X86ISD::SETCC ||
15909       CondOpcode == X86ISD::SETCC_CARRY) {
15910     CC = Cond.getOperand(0);
15911
15912     SDValue Cmp = Cond.getOperand(1);
15913     unsigned Opc = Cmp.getOpcode();
15914     MVT VT = Op.getSimpleValueType();
15915
15916     bool IllegalFPCMov = false;
15917     if (VT.isFloatingPoint() && !VT.isVector() &&
15918         !isScalarFPTypeInSSEReg(VT))  // FPStack?
15919       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15920
15921     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15922         Opc == X86ISD::BT) { // FIXME
15923       Cond = Cmp;
15924       addTest = false;
15925     }
15926   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15927              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15928              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15929               Cond.getOperand(0).getValueType() != MVT::i8)) {
15930     SDValue LHS = Cond.getOperand(0);
15931     SDValue RHS = Cond.getOperand(1);
15932     unsigned X86Opcode;
15933     unsigned X86Cond;
15934     SDVTList VTs;
15935     switch (CondOpcode) {
15936     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15937     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15938     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15939     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15940     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15941     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15942     default: llvm_unreachable("unexpected overflowing operator");
15943     }
15944     if (CondOpcode == ISD::UMULO)
15945       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15946                           MVT::i32);
15947     else
15948       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15949
15950     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15951
15952     if (CondOpcode == ISD::UMULO)
15953       Cond = X86Op.getValue(2);
15954     else
15955       Cond = X86Op.getValue(1);
15956
15957     CC = DAG.getConstant(X86Cond, DL, MVT::i8);
15958     addTest = false;
15959   }
15960
15961   if (addTest) {
15962     // Look past the truncate if the high bits are known zero.
15963     Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
15964
15965     // We know the result of AND is compared against zero. Try to match
15966     // it to BT.
15967     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15968       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
15969         CC = NewSetCC.getOperand(0);
15970         Cond = NewSetCC.getOperand(1);
15971         addTest = false;
15972       }
15973     }
15974   }
15975
15976   if (addTest) {
15977     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
15978     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15979   }
15980
15981   // a <  b ? -1 :  0 -> RES = ~setcc_carry
15982   // a <  b ?  0 : -1 -> RES = setcc_carry
15983   // a >= b ? -1 :  0 -> RES = setcc_carry
15984   // a >= b ?  0 : -1 -> RES = ~setcc_carry
15985   if (Cond.getOpcode() == X86ISD::SUB) {
15986     Cond = ConvertCmpIfNecessary(Cond, DAG);
15987     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15988
15989     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15990         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15991         (isNullConstant(Op1) || isNullConstant(Op2))) {
15992       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15993                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
15994                                 Cond);
15995       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
15996         return DAG.getNOT(DL, Res, Res.getValueType());
15997       return Res;
15998     }
15999   }
16000
16001   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
16002   // widen the cmov and push the truncate through. This avoids introducing a new
16003   // branch during isel and doesn't add any extensions.
16004   if (Op.getValueType() == MVT::i8 &&
16005       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16006     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16007     if (T1.getValueType() == T2.getValueType() &&
16008         // Blacklist CopyFromReg to avoid partial register stalls.
16009         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16010       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16011       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16012       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16013     }
16014   }
16015
16016   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16017   // condition is true.
16018   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16019   SDValue Ops[] = { Op2, Op1, CC, Cond };
16020   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16021 }
16022
16023 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
16024                                        const X86Subtarget &Subtarget,
16025                                        SelectionDAG &DAG) {
16026   MVT VT = Op->getSimpleValueType(0);
16027   SDValue In = Op->getOperand(0);
16028   MVT InVT = In.getSimpleValueType();
16029   MVT VTElt = VT.getVectorElementType();
16030   MVT InVTElt = InVT.getVectorElementType();
16031   SDLoc dl(Op);
16032
16033   // SKX processor
16034   if ((InVTElt == MVT::i1) &&
16035       (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
16036         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16037
16038        ((Subtarget.hasBWI() && VT.is512BitVector() &&
16039         VTElt.getSizeInBits() <= 16)) ||
16040
16041        ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
16042         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16043
16044        ((Subtarget.hasDQI() && VT.is512BitVector() &&
16045         VTElt.getSizeInBits() >= 32))))
16046     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16047
16048   unsigned int NumElts = VT.getVectorNumElements();
16049
16050   if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
16051     return SDValue();
16052
16053   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16054     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16055       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16056     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16057   }
16058
16059   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16060   MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
16061   SDValue NegOne =
16062    DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
16063                    ExtVT);
16064   SDValue Zero =
16065    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
16066
16067   SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
16068   if (VT.is512BitVector())
16069     return V;
16070   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
16071 }
16072
16073 static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
16074                                              const X86Subtarget &Subtarget,
16075                                              SelectionDAG &DAG) {
16076   SDValue In = Op->getOperand(0);
16077   MVT VT = Op->getSimpleValueType(0);
16078   MVT InVT = In.getSimpleValueType();
16079   assert(VT.getSizeInBits() == InVT.getSizeInBits());
16080
16081   MVT SVT = VT.getVectorElementType();
16082   MVT InSVT = InVT.getVectorElementType();
16083   assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
16084
16085   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
16086     return SDValue();
16087   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
16088     return SDValue();
16089   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
16090       !(VT.is256BitVector() && Subtarget.hasInt256()))
16091     return SDValue();
16092
16093   SDLoc dl(Op);
16094
16095   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
16096   if (VT.is256BitVector())
16097     In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
16098                      MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
16099                      In, DAG.getIntPtrConstant(0, dl));
16100
16101   // SSE41 targets can use the pmovsx* instructions directly.
16102   if (Subtarget.hasSSE41())
16103     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16104
16105   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
16106   SDValue Curr = In;
16107   MVT CurrVT = InVT;
16108
16109   // As SRAI is only available on i16/i32 types, we expand only up to i32
16110   // and handle i64 separately.
16111   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
16112     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
16113     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
16114     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
16115     Curr = DAG.getBitcast(CurrVT, Curr);
16116   }
16117
16118   SDValue SignExt = Curr;
16119   if (CurrVT != InVT) {
16120     unsigned SignExtShift =
16121         CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
16122     SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16123                           DAG.getConstant(SignExtShift, dl, MVT::i8));
16124   }
16125
16126   if (CurrVT == VT)
16127     return SignExt;
16128
16129   if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
16130     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16131                                DAG.getConstant(31, dl, MVT::i8));
16132     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
16133     return DAG.getBitcast(VT, Ext);
16134   }
16135
16136   return SDValue();
16137 }
16138
16139 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16140                                 SelectionDAG &DAG) {
16141   MVT VT = Op->getSimpleValueType(0);
16142   SDValue In = Op->getOperand(0);
16143   MVT InVT = In.getSimpleValueType();
16144   SDLoc dl(Op);
16145
16146   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16147     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16148
16149   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16150       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16151       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16152     return SDValue();
16153
16154   if (Subtarget.hasInt256())
16155     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16156
16157   // Optimize vectors in AVX mode
16158   // Sign extend  v8i16 to v8i32 and
16159   //              v4i32 to v4i64
16160   //
16161   // Divide input vector into two parts
16162   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16163   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16164   // concat the vectors to original VT
16165
16166   unsigned NumElems = InVT.getVectorNumElements();
16167   SDValue Undef = DAG.getUNDEF(InVT);
16168
16169   SmallVector<int,8> ShufMask1(NumElems, -1);
16170   for (unsigned i = 0; i != NumElems/2; ++i)
16171     ShufMask1[i] = i;
16172
16173   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
16174
16175   SmallVector<int,8> ShufMask2(NumElems, -1);
16176   for (unsigned i = 0; i != NumElems/2; ++i)
16177     ShufMask2[i] = i + NumElems/2;
16178
16179   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
16180
16181   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
16182                                 VT.getVectorNumElements()/2);
16183
16184   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16185   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16186
16187   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16188 }
16189
16190 // Lower truncating store. We need a special lowering to vXi1 vectors
16191 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
16192                                     SelectionDAG &DAG) {
16193   StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
16194   SDLoc dl(St);
16195   EVT MemVT = St->getMemoryVT();
16196   assert(St->isTruncatingStore() && "We only custom truncating store.");
16197   assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
16198          "Expected truncstore of i1 vector");
16199
16200   SDValue Op = St->getValue();
16201   MVT OpVT = Op.getValueType().getSimpleVT();
16202   unsigned NumElts = OpVT.getVectorNumElements();
16203   if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16204       NumElts == 16) {
16205     // Truncate and store - everything is legal
16206     Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
16207     if (MemVT.getSizeInBits() < 8)
16208       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
16209                        DAG.getUNDEF(MVT::v8i1), Op,
16210                        DAG.getIntPtrConstant(0, dl));
16211     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16212                         St->getMemOperand());
16213   }
16214
16215   // A subset, assume that we have only AVX-512F
16216   if (NumElts <= 8) {
16217     if (NumElts < 8) {
16218       // Extend to 8-elts vector
16219       MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
16220       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
16221                         DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
16222     }
16223     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
16224     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16225                         St->getMemOperand());
16226   }
16227   // v32i8
16228   assert(OpVT == MVT::v32i8 && "Unexpected operand type");
16229   // Divide the vector into 2 parts and store each part separately
16230   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16231                             DAG.getIntPtrConstant(0, dl));
16232   Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
16233   SDValue BasePtr = St->getBasePtr();
16234   SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
16235                               St->getMemOperand());
16236   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16237                             DAG.getIntPtrConstant(16, dl));
16238   Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
16239
16240   SDValue BasePtrHi =
16241     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16242                 DAG.getConstant(2, dl, BasePtr.getValueType()));
16243
16244   SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
16245                               BasePtrHi, St->getMemOperand());
16246   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
16247 }
16248
16249 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
16250                                            const X86Subtarget &Subtarget,
16251                                            SelectionDAG &DAG) {
16252
16253   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16254   SDLoc dl(Ld);
16255   EVT MemVT = Ld->getMemoryVT();
16256   assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
16257          "Expected i1 vector load");
16258   unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
16259     ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16260   MVT VT = Op.getValueType().getSimpleVT();
16261   unsigned NumElts = VT.getVectorNumElements();
16262
16263   if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16264       NumElts == 16) {
16265     // Load and extend - everything is legal
16266     if (NumElts < 8) {
16267       SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
16268                                  Ld->getBasePtr(),
16269                                  Ld->getMemOperand());
16270       // Replace chain users with the new chain.
16271       assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16272       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16273       MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16274       SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
16275
16276       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16277                                    DAG.getIntPtrConstant(0, dl));
16278     }
16279     SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
16280                                Ld->getBasePtr(),
16281                                Ld->getMemOperand());
16282     // Replace chain users with the new chain.
16283     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16284     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16285
16286     // Finally, do a normal sign-extend to the desired register.
16287     return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
16288   }
16289
16290   if (NumElts <= 8) {
16291     // A subset, assume that we have only AVX-512F
16292     unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
16293     MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
16294     SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
16295                               Ld->getBasePtr(),
16296                               Ld->getMemOperand());
16297     // Replace chain users with the new chain.
16298     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16299     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16300
16301     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
16302     SDValue BitVec = DAG.getBitcast(MaskVT, Load);
16303
16304     if (NumElts == 8)
16305       return DAG.getNode(ExtOpcode, dl, VT, BitVec);
16306
16307       // we should take care to v4i1 and v2i1
16308
16309     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16310     SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
16311     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16312                         DAG.getIntPtrConstant(0, dl));
16313   }
16314
16315   assert(VT == MVT::v32i8 && "Unexpected extload type");
16316
16317   SmallVector<SDValue, 2> Chains;
16318
16319   SDValue BasePtr = Ld->getBasePtr();
16320   SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16321                                Ld->getBasePtr(),
16322                                Ld->getMemOperand());
16323   Chains.push_back(LoadLo.getValue(1));
16324
16325   SDValue BasePtrHi =
16326     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16327                 DAG.getConstant(2, dl, BasePtr.getValueType()));
16328
16329   SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16330                                BasePtrHi,
16331                                Ld->getMemOperand());
16332   Chains.push_back(LoadHi.getValue(1));
16333   SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16334   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
16335
16336   SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
16337   SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
16338   return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
16339 }
16340
16341 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16342 // may emit an illegal shuffle but the expansion is still better than scalar
16343 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16344 // we'll emit a shuffle and a arithmetic shift.
16345 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16346 // TODO: It is possible to support ZExt by zeroing the undef values during
16347 // the shuffle phase or after the shuffle.
16348 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
16349                                  SelectionDAG &DAG) {
16350   MVT RegVT = Op.getSimpleValueType();
16351   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16352   assert(RegVT.isInteger() &&
16353          "We only custom lower integer vector sext loads.");
16354
16355   // Nothing useful we can do without SSE2 shuffles.
16356   assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
16357
16358   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16359   SDLoc dl(Ld);
16360   EVT MemVT = Ld->getMemoryVT();
16361   if (MemVT.getScalarType() == MVT::i1)
16362     return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
16363
16364   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16365   unsigned RegSz = RegVT.getSizeInBits();
16366
16367   ISD::LoadExtType Ext = Ld->getExtensionType();
16368
16369   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16370          && "Only anyext and sext are currently implemented.");
16371   assert(MemVT != RegVT && "Cannot extend to the same type");
16372   assert(MemVT.isVector() && "Must load a vector from memory");
16373
16374   unsigned NumElems = RegVT.getVectorNumElements();
16375   unsigned MemSz = MemVT.getSizeInBits();
16376   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16377
16378   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
16379     // The only way in which we have a legal 256-bit vector result but not the
16380     // integer 256-bit operations needed to directly lower a sextload is if we
16381     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16382     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16383     // correctly legalized. We do this late to allow the canonical form of
16384     // sextload to persist throughout the rest of the DAG combiner -- it wants
16385     // to fold together any extensions it can, and so will fuse a sign_extend
16386     // of an sextload into a sextload targeting a wider value.
16387     SDValue Load;
16388     if (MemSz == 128) {
16389       // Just switch this to a normal load.
16390       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16391                                        "it must be a legal 128-bit vector "
16392                                        "type!");
16393       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16394                          Ld->getPointerInfo(), Ld->getAlignment(),
16395                          Ld->getMemOperand()->getFlags());
16396     } else {
16397       assert(MemSz < 128 &&
16398              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16399       // Do an sext load to a 128-bit vector type. We want to use the same
16400       // number of elements, but elements half as wide. This will end up being
16401       // recursively lowered by this routine, but will succeed as we definitely
16402       // have all the necessary features if we're using AVX1.
16403       EVT HalfEltVT =
16404           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16405       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16406       Load =
16407           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16408                          Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
16409                          Ld->getMemOperand()->getFlags());
16410     }
16411
16412     // Replace chain users with the new chain.
16413     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16414     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16415
16416     // Finally, do a normal sign-extend to the desired register.
16417     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16418   }
16419
16420   // All sizes must be a power of two.
16421   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16422          "Non-power-of-two elements are not custom lowered!");
16423
16424   // Attempt to load the original value using scalar loads.
16425   // Find the largest scalar type that divides the total loaded size.
16426   MVT SclrLoadTy = MVT::i8;
16427   for (MVT Tp : MVT::integer_valuetypes()) {
16428     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16429       SclrLoadTy = Tp;
16430     }
16431   }
16432
16433   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16434   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16435       (64 <= MemSz))
16436     SclrLoadTy = MVT::f64;
16437
16438   // Calculate the number of scalar loads that we need to perform
16439   // in order to load our vector from memory.
16440   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16441
16442   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16443          "Can only lower sext loads with a single scalar load!");
16444
16445   unsigned loadRegZize = RegSz;
16446   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
16447     loadRegZize = 128;
16448
16449   // Represent our vector as a sequence of elements which are the
16450   // largest scalar that we can load.
16451   EVT LoadUnitVecVT = EVT::getVectorVT(
16452       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16453
16454   // Represent the data using the same element type that is stored in
16455   // memory. In practice, we ''widen'' MemVT.
16456   EVT WideVecVT =
16457       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16458                        loadRegZize / MemVT.getScalarSizeInBits());
16459
16460   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16461          "Invalid vector type");
16462
16463   // We can't shuffle using an illegal type.
16464   assert(TLI.isTypeLegal(WideVecVT) &&
16465          "We only lower types that form legal widened vector types");
16466
16467   SmallVector<SDValue, 8> Chains;
16468   SDValue Ptr = Ld->getBasePtr();
16469   SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
16470                                       TLI.getPointerTy(DAG.getDataLayout()));
16471   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16472
16473   for (unsigned i = 0; i < NumLoads; ++i) {
16474     // Perform a single load.
16475     SDValue ScalarLoad =
16476         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16477                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
16478     Chains.push_back(ScalarLoad.getValue(1));
16479     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16480     // another round of DAGCombining.
16481     if (i == 0)
16482       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16483     else
16484       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16485                         ScalarLoad, DAG.getIntPtrConstant(i, dl));
16486
16487     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16488   }
16489
16490   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16491
16492   // Bitcast the loaded value to a vector of the original element type, in
16493   // the size of the target vector type.
16494   SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
16495   unsigned SizeRatio = RegSz / MemSz;
16496
16497   if (Ext == ISD::SEXTLOAD) {
16498     // If we have SSE4.1, we can directly emit a VSEXT node.
16499     if (Subtarget.hasSSE41()) {
16500       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16501       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16502       return Sext;
16503     }
16504
16505     // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
16506     // lanes.
16507     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
16508            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
16509
16510     SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
16511     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16512     return Shuff;
16513   }
16514
16515   // Redistribute the loaded elements into the different locations.
16516   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16517   for (unsigned i = 0; i != NumElems; ++i)
16518     ShuffleVec[i * SizeRatio] = i;
16519
16520   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16521                                        DAG.getUNDEF(WideVecVT), ShuffleVec);
16522
16523   // Bitcast to the requested type.
16524   Shuff = DAG.getBitcast(RegVT, Shuff);
16525   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16526   return Shuff;
16527 }
16528
16529 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
16530 /// each of which has no other use apart from the AND / OR.
16531 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16532   Opc = Op.getOpcode();
16533   if (Opc != ISD::OR && Opc != ISD::AND)
16534     return false;
16535   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16536           Op.getOperand(0).hasOneUse() &&
16537           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16538           Op.getOperand(1).hasOneUse());
16539 }
16540
16541 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
16542 /// SETCC node has a single use.
16543 static bool isXor1OfSetCC(SDValue Op) {
16544   if (Op.getOpcode() != ISD::XOR)
16545     return false;
16546   if (isOneConstant(Op.getOperand(1)))
16547     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16548            Op.getOperand(0).hasOneUse();
16549   return false;
16550 }
16551
16552 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16553   bool addTest = true;
16554   SDValue Chain = Op.getOperand(0);
16555   SDValue Cond  = Op.getOperand(1);
16556   SDValue Dest  = Op.getOperand(2);
16557   SDLoc dl(Op);
16558   SDValue CC;
16559   bool Inverted = false;
16560
16561   if (Cond.getOpcode() == ISD::SETCC) {
16562     // Check for setcc([su]{add,sub,mul}o == 0).
16563     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16564         isNullConstant(Cond.getOperand(1)) &&
16565         Cond.getOperand(0).getResNo() == 1 &&
16566         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16567          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16568          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16569          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16570          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16571          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16572       Inverted = true;
16573       Cond = Cond.getOperand(0);
16574     } else {
16575       if (SDValue NewCond = LowerSETCC(Cond, DAG))
16576         Cond = NewCond;
16577     }
16578   }
16579 #if 0
16580   // FIXME: LowerXALUO doesn't handle these!!
16581   else if (Cond.getOpcode() == X86ISD::ADD  ||
16582            Cond.getOpcode() == X86ISD::SUB  ||
16583            Cond.getOpcode() == X86ISD::SMUL ||
16584            Cond.getOpcode() == X86ISD::UMUL)
16585     Cond = LowerXALUO(Cond, DAG);
16586 #endif
16587
16588   // Look past (and (setcc_carry (cmp ...)), 1).
16589   if (Cond.getOpcode() == ISD::AND &&
16590       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
16591       isOneConstant(Cond.getOperand(1)))
16592     Cond = Cond.getOperand(0);
16593
16594   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16595   // setting operand in place of the X86ISD::SETCC.
16596   unsigned CondOpcode = Cond.getOpcode();
16597   if (CondOpcode == X86ISD::SETCC ||
16598       CondOpcode == X86ISD::SETCC_CARRY) {
16599     CC = Cond.getOperand(0);
16600
16601     SDValue Cmp = Cond.getOperand(1);
16602     unsigned Opc = Cmp.getOpcode();
16603     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16604     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16605       Cond = Cmp;
16606       addTest = false;
16607     } else {
16608       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16609       default: break;
16610       case X86::COND_O:
16611       case X86::COND_B:
16612         // These can only come from an arithmetic instruction with overflow,
16613         // e.g. SADDO, UADDO.
16614         Cond = Cond.getNode()->getOperand(1);
16615         addTest = false;
16616         break;
16617       }
16618     }
16619   }
16620   CondOpcode = Cond.getOpcode();
16621   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16622       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16623       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16624        Cond.getOperand(0).getValueType() != MVT::i8)) {
16625     SDValue LHS = Cond.getOperand(0);
16626     SDValue RHS = Cond.getOperand(1);
16627     unsigned X86Opcode;
16628     unsigned X86Cond;
16629     SDVTList VTs;
16630     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16631     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16632     // X86ISD::INC).
16633     switch (CondOpcode) {
16634     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16635     case ISD::SADDO:
16636       if (isOneConstant(RHS)) {
16637           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16638           break;
16639         }
16640       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16641     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16642     case ISD::SSUBO:
16643       if (isOneConstant(RHS)) {
16644           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16645           break;
16646         }
16647       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16648     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16649     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16650     default: llvm_unreachable("unexpected overflowing operator");
16651     }
16652     if (Inverted)
16653       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16654     if (CondOpcode == ISD::UMULO)
16655       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16656                           MVT::i32);
16657     else
16658       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16659
16660     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16661
16662     if (CondOpcode == ISD::UMULO)
16663       Cond = X86Op.getValue(2);
16664     else
16665       Cond = X86Op.getValue(1);
16666
16667     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16668     addTest = false;
16669   } else {
16670     unsigned CondOpc;
16671     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16672       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16673       if (CondOpc == ISD::OR) {
16674         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16675         // two branches instead of an explicit OR instruction with a
16676         // separate test.
16677         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16678             isX86LogicalCmp(Cmp)) {
16679           CC = Cond.getOperand(0).getOperand(0);
16680           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16681                               Chain, Dest, CC, Cmp);
16682           CC = Cond.getOperand(1).getOperand(0);
16683           Cond = Cmp;
16684           addTest = false;
16685         }
16686       } else { // ISD::AND
16687         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16688         // two branches instead of an explicit AND instruction with a
16689         // separate test. However, we only do this if this block doesn't
16690         // have a fall-through edge, because this requires an explicit
16691         // jmp when the condition is false.
16692         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16693             isX86LogicalCmp(Cmp) &&
16694             Op.getNode()->hasOneUse()) {
16695           X86::CondCode CCode =
16696             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16697           CCode = X86::GetOppositeBranchCondition(CCode);
16698           CC = DAG.getConstant(CCode, dl, MVT::i8);
16699           SDNode *User = *Op.getNode()->use_begin();
16700           // Look for an unconditional branch following this conditional branch.
16701           // We need this because we need to reverse the successors in order
16702           // to implement FCMP_OEQ.
16703           if (User->getOpcode() == ISD::BR) {
16704             SDValue FalseBB = User->getOperand(1);
16705             SDNode *NewBR =
16706               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16707             assert(NewBR == User);
16708             (void)NewBR;
16709             Dest = FalseBB;
16710
16711             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16712                                 Chain, Dest, CC, Cmp);
16713             X86::CondCode CCode =
16714               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16715             CCode = X86::GetOppositeBranchCondition(CCode);
16716             CC = DAG.getConstant(CCode, dl, MVT::i8);
16717             Cond = Cmp;
16718             addTest = false;
16719           }
16720         }
16721       }
16722     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16723       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
16724       // It should be transformed during dag combiner except when the condition
16725       // is set by a arithmetics with overflow node.
16726       X86::CondCode CCode =
16727         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16728       CCode = X86::GetOppositeBranchCondition(CCode);
16729       CC = DAG.getConstant(CCode, dl, MVT::i8);
16730       Cond = Cond.getOperand(0).getOperand(1);
16731       addTest = false;
16732     } else if (Cond.getOpcode() == ISD::SETCC &&
16733                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16734       // For FCMP_OEQ, we can emit
16735       // two branches instead of an explicit AND instruction with a
16736       // separate test. However, we only do this if this block doesn't
16737       // have a fall-through edge, because this requires an explicit
16738       // jmp when the condition is false.
16739       if (Op.getNode()->hasOneUse()) {
16740         SDNode *User = *Op.getNode()->use_begin();
16741         // Look for an unconditional branch following this conditional branch.
16742         // We need this because we need to reverse the successors in order
16743         // to implement FCMP_OEQ.
16744         if (User->getOpcode() == ISD::BR) {
16745           SDValue FalseBB = User->getOperand(1);
16746           SDNode *NewBR =
16747             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16748           assert(NewBR == User);
16749           (void)NewBR;
16750           Dest = FalseBB;
16751
16752           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16753                                     Cond.getOperand(0), Cond.getOperand(1));
16754           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16755           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16756           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16757                               Chain, Dest, CC, Cmp);
16758           CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
16759           Cond = Cmp;
16760           addTest = false;
16761         }
16762       }
16763     } else if (Cond.getOpcode() == ISD::SETCC &&
16764                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16765       // For FCMP_UNE, we can emit
16766       // two branches instead of an explicit AND instruction with a
16767       // separate test. However, we only do this if this block doesn't
16768       // have a fall-through edge, because this requires an explicit
16769       // jmp when the condition is false.
16770       if (Op.getNode()->hasOneUse()) {
16771         SDNode *User = *Op.getNode()->use_begin();
16772         // Look for an unconditional branch following this conditional branch.
16773         // We need this because we need to reverse the successors in order
16774         // to implement FCMP_UNE.
16775         if (User->getOpcode() == ISD::BR) {
16776           SDValue FalseBB = User->getOperand(1);
16777           SDNode *NewBR =
16778             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16779           assert(NewBR == User);
16780           (void)NewBR;
16781
16782           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16783                                     Cond.getOperand(0), Cond.getOperand(1));
16784           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16785           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16786           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16787                               Chain, Dest, CC, Cmp);
16788           CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
16789           Cond = Cmp;
16790           addTest = false;
16791           Dest = FalseBB;
16792         }
16793       }
16794     }
16795   }
16796
16797   if (addTest) {
16798     // Look pass the truncate if the high bits are known zero.
16799     Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
16800
16801     // We know the result of AND is compared against zero. Try to match
16802     // it to BT.
16803     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16804       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
16805         CC = NewSetCC.getOperand(0);
16806         Cond = NewSetCC.getOperand(1);
16807         addTest = false;
16808       }
16809     }
16810   }
16811
16812   if (addTest) {
16813     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16814     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16815     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16816   }
16817   Cond = ConvertCmpIfNecessary(Cond, DAG);
16818   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16819                      Chain, Dest, CC, Cond);
16820 }
16821
16822 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16823 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16824 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16825 // that the guard pages used by the OS virtual memory manager are allocated in
16826 // correct sequence.
      //
      // Operand 0 of Op is the chain, operand 1 the allocation size and operand 2
      // a constant alignment. One of three strategies is chosen below:
      //   * neither (Windows && !MachO) nor split-stack: subtract the size from
      //     the stack pointer inline, re-aligning only when Align exceeds the
      //     frame lowering's stack alignment;
      //   * split-stack: emit X86ISD::SEG_ALLOCA on a virtual register that holds
      //     the size;
      //   * otherwise: emit X86ISD::WIN_ALLOCA and read the stack pointer back.
      // The whole sequence is bracketed by CALLSEQ_START/CALLSEQ_END and the
      // function returns the merged pair {allocated address, output chain}.
16827 SDValue
16828 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16829                                            SelectionDAG &DAG) const {
16830   MachineFunction &MF = DAG.getMachineFunction();
16831   bool SplitStack = MF.shouldSplitStack();
        // Lower is true when a dedicated allocation node is needed: a Windows
        // target that is not MachO, or a function using split stacks.
16832   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
16833                SplitStack;
16834   SDLoc dl(Op);
16835
16836   // Get the inputs.
16837   SDNode *Node = Op.getNode();
16838   SDValue Chain = Op.getOperand(0);
16839   SDValue Size  = Op.getOperand(1);
16840   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16841   EVT VT = Node->getValueType(0);
16842
16843   // Chain the dynamic stack allocation so that it doesn't modify the stack
16844   // pointer when other instructions are using the stack.
16845   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
16846
16847   bool Is64Bit = Subtarget.is64Bit();
16848   MVT SPTy = getPointerTy(DAG.getDataLayout());
16849
16850   SDValue Result;
16851   if (!Lower) {
          // Inline expansion: Result = SP - Size, optionally masked down to Align,
          // then written back to the stack pointer register.
16852     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16853     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16854     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16855                     " not tell us which reg is the stack pointer!");
16856
16857     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16858     Chain = SP.getValue(1);
16859     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
16860     unsigned StackAlign = TFI.getStackAlignment();
16861     Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
          // Only mask the address when the request is stricter than StackAlign.
16862     if (Align > StackAlign)
16863       Result = DAG.getNode(ISD::AND, dl, VT, Result,
16864                          DAG.getConstant(-(uint64_t)Align, dl, VT));
16865     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
16866   } else if (SplitStack) {
16867     MachineRegisterInfo &MRI = MF.getRegInfo();
16868
16869     if (Is64Bit) {
16870       // The 64 bit implementation of segmented stacks needs to clobber both r10
16871       // r11. This makes it impossible to use it along with nested parameters.
16872       const Function *F = MF.getFunction();
16873       for (const auto &A : F->args()) {
16874         if (A.hasNestAttr())
16875           report_fatal_error("Cannot use segmented stacks with functions that "
16876                              "have nested arguments.");
16877       }
16878     }
16879
          // Copy the size into a fresh pointer-class virtual register and hand
          // that register to the SEG_ALLOCA node, whose value is the result.
16880     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
16881     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16882     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16883     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16884                                 DAG.getRegister(Vreg, SPTy));
16885   } else {
          // WIN_ALLOCA produces only a chain and glue; the allocated address is
          // obtained by reading the stack register afterwards.
16886     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16887     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
16888     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
16889
16890     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
16891     unsigned SPReg = RegInfo->getStackRegister();
16892     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16893     Chain = SP.getValue(1);
16894
          // A non-zero alignment operand masks the stack pointer down to Align
          // and stores the aligned value back into the stack register.
16895     if (Align) {
16896       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16897                        DAG.getConstant(-(uint64_t)Align, dl, VT));
16898       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16899     }
16900
16901     Result = SP;
16902   }
16903
16904   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
16905                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
16906
        // Return both the allocated address and the updated chain.
16907   SDValue Ops[2] = {Result, Chain};
16908   return DAG.getMergeValues(Ops, dl);
16909 }
16910
16911 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
        // Lowers a va_start node. Operand 0 is the chain, operand 1 the address
        // of the va_list object and operand 2 its SrcValue.
        //  - On 32-bit targets and for the Win64 calling convention a single
        //    store of the VarArgsFrameIndex address is returned.
        //  - Otherwise the four __va_list_tag fields are stored individually and
        //    the stores are joined with a TokenFactor.
16912   MachineFunction &MF = DAG.getMachineFunction();
16913   auto PtrVT = getPointerTy(MF.getDataLayout());
16914   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16915
16916   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16917   SDLoc DL(Op);
16918
16919   if (!Subtarget.is64Bit() ||
16920       Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
16921     // vastart just stores the address of the VarArgsFrameIndex slot into the
16922     // memory location argument.
16923     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16924     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16925                         MachinePointerInfo(SV));
16926   }
16927
16928   // __va_list_tag:
16929   //   gp_offset         (0 - 6 * 8)
16930   //   fp_offset         (48 - 48 + 8 * 16)
16931   //   overflow_arg_area (point to parameters coming in memory).
16932   //   reg_save_area
        // Every store below uses the incoming chain (operand 0) directly, so the
        // four stores are independent of each other until the final TokenFactor.
        // FIN walks through the va_list object field by field.
16933   SmallVector<SDValue, 8> MemOps;
16934   SDValue FIN = Op.getOperand(1);
16935   // Store gp_offset
16936   SDValue Store = DAG.getStore(
16937       Op.getOperand(0), DL,
16938       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
16939       MachinePointerInfo(SV));
16940   MemOps.push_back(Store);
16941
16942   // Store fp_offset
        // (i32 at byte offset 4 of the va_list.)
16943   FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
16944   Store = DAG.getStore(
16945       Op.getOperand(0), DL,
16946       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
16947       MachinePointerInfo(SV, 4));
16948   MemOps.push_back(Store);
16949
16950   // Store ptr to overflow_arg_area
        // (byte offset 8; the stored value is the VarArgsFrameIndex address.)
16951   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
16952   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16953   Store =
16954       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
16955   MemOps.push_back(Store);
16956
16957   // Store ptr to reg_save_area.
        // The previous field is a pointer, so the step is 8 bytes on LP64 and 4
        // bytes otherwise, placing this field at offset 16 or 12 respectively.
16958   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
16959       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
16960   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
16961   Store = DAG.getStore(
16962       Op.getOperand(0), DL, RSFIN, FIN,
16963       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
16964   MemOps.push_back(Store);
16965   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16966 }
16967
16968 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
        // Lowers a va_arg node on 64-bit targets. Operands: 0 = chain, 1 = pointer
        // to the va_list, 2 = SrcValue, 3 = constant alignment. For the Win64
        // calling convention the generic expansion is used; otherwise an
        // X86ISD::VAARG_64 node computes the argument's address and the argument
        // is then loaded from that address.
16969   assert(Subtarget.is64Bit() &&
16970          "LowerVAARG only handles 64-bit va_arg!");
16971   assert(Op.getNode()->getNumOperands() == 4);
16972
16973   MachineFunction &MF = DAG.getMachineFunction();
16974   if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
16975     // The Win64 ABI uses char* instead of a structure.
16976     return DAG.expandVAArg(Op.getNode());
16977
16978   SDValue Chain = Op.getOperand(0);
16979   SDValue SrcPtr = Op.getOperand(1);
16980   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16981   unsigned Align = Op.getConstantOperandVal(3);
16982   SDLoc dl(Op);
16983
16984   EVT ArgVT = Op.getNode()->getValueType(0);
16985   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16986   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
        // ArgMode is passed to VAARG_64 as an i8 operand:
        // 1 = use gp_offset, 2 = use fp_offset.
16987   uint8_t ArgMode;
16988
16989   // Decide which area this value should be read from.
16990   // TODO: Implement the AMD64 ABI in its entirety. This simple
16991   // selection mechanism works only for the basic types.
16992   if (ArgVT == MVT::f80) {
16993     llvm_unreachable("va_arg for f80 not yet implemented");
16994   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16995     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16996   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16997     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16998   } else {
16999     llvm_unreachable("Unhandled argument type in LowerVAARG");
17000   }
17001
17002   if (ArgMode == 2) {
17003     // Sanity Check: Make sure using fp_offset makes sense.
17004     assert(!Subtarget.useSoftFloat() &&
17005            !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
17006            Subtarget.hasSSE1());
17007   }
17008
17009   // Insert VAARG_64 node into the DAG
17010   // VAARG_64 returns two values: Variable Argument Address, Chain
        // It is created as a memory intrinsic that both reads and writes memory
        // through the va_list's SrcValue.
17011   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
17012                        DAG.getConstant(ArgMode, dl, MVT::i8),
17013                        DAG.getConstant(Align, dl, MVT::i32)};
17014   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
17015   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17016                                           VTs, InstOps, MVT::i64,
17017                                           MachinePointerInfo(SV),
17018                                           /*Align=*/0,
17019                                           /*Volatile=*/false,
17020                                           /*ReadMem=*/true,
17021                                           /*WriteMem=*/true);
17022   Chain = VAARG.getValue(1);
17023
17024   // Load the next argument and return it
        // (the load is chained after VAARG_64 and carries no pointer info).
17025   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
17026 }
17027
17028 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
17029                            SelectionDAG &DAG) {
17030   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
17031   // where a va_list is still an i8*.
17032   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
17033   if (Subtarget.isCallingConvWin64(
17034         DAG.getMachineFunction().getFunction()->getCallingConv()))
17035     // Probably a Win64 va_copy.
17036     return DAG.expandVACopy(Op.getNode());
17037
17038   SDValue Chain = Op.getOperand(0);
17039   SDValue DstPtr = Op.getOperand(1);
17040   SDValue SrcPtr = Op.getOperand(2);
17041   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17042   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17043   SDLoc DL(Op);
17044
17045   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17046                        DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
17047                        false, false,
17048                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17049 }
17050
17051 /// Handle vector element shifts where the shift amount is a constant.
17052 /// Takes immediate version of shift as input.
      ///
      /// \p Opc is expected to be X86ISD::VSHLI, VSRLI or VSRAI (asserted below).
      /// Results, in order of checks:
      ///  - \p ShiftAmt == 0: \p SrcOp is returned unchanged.
      ///  - \p ShiftAmt >= element width: VSRAI is clamped to width - 1, every
      ///    other opcode yields an all-zero constant of type \p VT.
      ///  - \p SrcOp is a build vector of constants/undefs of type \p VT: the
      ///    shift is folded element-wise into a new build vector.
      ///  - otherwise an \p Opc node with an i8 constant shift amount is emitted.
17053 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
17054                                           SDValue SrcOp, uint64_t ShiftAmt,
17055                                           SelectionDAG &DAG) {
17056   MVT ElementType = VT.getVectorElementType();
17057
17058   // Fold this packed shift into its first operand if ShiftAmt is 0.
17059   if (ShiftAmt == 0)
17060     return SrcOp;
17061
17062   // Check for ShiftAmt >= element width
17063   if (ShiftAmt >= ElementType.getSizeInBits()) {
17064     if (Opc == X86ISD::VSRAI)
17065       ShiftAmt = ElementType.getSizeInBits() - 1;
17066     else
17067       return DAG.getConstant(0, dl, VT);
17068   }
17069
        // Note: this check runs only after the early returns above.
17070   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17071          && "Unknown target vector shift-by-constant node");
17072
17073   // Fold this packed vector shift into a build vector if SrcOp is a
17074   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17075   if (VT == SrcOp.getSimpleValueType() &&
17076       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17077     SmallVector<SDValue, 8> Elts;
17078     unsigned NumElts = SrcOp->getNumOperands();
17079     ConstantSDNode *ND;
17080
          // The three cases differ only in the APInt shift applied (shl, lshr,
          // ashr); undef elements are forwarded untouched in all of them.
17081     switch(Opc) {
17082     default: llvm_unreachable("Unknown opcode!");
17083     case X86ISD::VSHLI:
17084       for (unsigned i=0; i!=NumElts; ++i) {
17085         SDValue CurrentOp = SrcOp->getOperand(i);
17086         if (CurrentOp->isUndef()) {
17087           Elts.push_back(CurrentOp);
17088           continue;
17089         }
17090         ND = cast<ConstantSDNode>(CurrentOp);
17091         const APInt &C = ND->getAPIntValue();
17092         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
17093       }
17094       break;
17095     case X86ISD::VSRLI:
17096       for (unsigned i=0; i!=NumElts; ++i) {
17097         SDValue CurrentOp = SrcOp->getOperand(i);
17098         if (CurrentOp->isUndef()) {
17099           Elts.push_back(CurrentOp);
17100           continue;
17101         }
17102         ND = cast<ConstantSDNode>(CurrentOp);
17103         const APInt &C = ND->getAPIntValue();
17104         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
17105       }
17106       break;
17107     case X86ISD::VSRAI:
17108       for (unsigned i=0; i!=NumElts; ++i) {
17109         SDValue CurrentOp = SrcOp->getOperand(i);
17110         if (CurrentOp->isUndef()) {
17111           Elts.push_back(CurrentOp);
17112           continue;
17113         }
17114         ND = cast<ConstantSDNode>(CurrentOp);
17115         const APInt &C = ND->getAPIntValue();
17116         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
17117       }
17118       break;
17119     }
17120
17121     return DAG.getBuildVector(VT, dl, Elts);
17122   }
17123
        // No folding possible: emit the immediate-form shift node itself.
17124   return DAG.getNode(Opc, dl, VT, SrcOp,
17125                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
17126 }
17127
17128 /// Handle vector element shifts where the shift amount may or may not be a
17129 /// constant. Takes immediate version of shift as input.
      ///
      /// \p ShAmt must be an i32 or i64 scalar (asserted). A constant amount is
      /// forwarded to getTargetVShiftByConstNode. Otherwise \p Opc is mapped from
      /// the immediate form (VSHLI/VSRLI/VSRAI) to the variable form
      /// (VSHL/VSRL/VSRA), the amount is placed into a 128-bit vector, and that
      /// vector is bitcast to a 128-bit type with \p VT's element type before the
      /// shift node is created.
17130 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
17131                                    SDValue SrcOp, SDValue ShAmt,
17132                                    SelectionDAG &DAG) {
17133   MVT SVT = ShAmt.getSimpleValueType();
17134   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17135
17136   // Catch shift-by-constant.
17137   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17138     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17139                                       CShAmt->getZExtValue(), DAG);
17140
17141   // Change opcode to non-immediate version
17142   switch (Opc) {
17143     default: llvm_unreachable("Unknown target vector shift node");
17144     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17145     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17146     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17147   }
17148
17149   const X86Subtarget &Subtarget =
17150       static_cast<const X86Subtarget &>(DAG.getSubtarget());
        // With SSE4.1 and an amount that is a zero-extended i16, build the
        // vector from the original i16 value instead of the extended scalar.
17151   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17152       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17153     // Let the shuffle legalizer expand this shift amount node.
17154     SDValue Op0 = ShAmt.getOperand(0);
17155     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17156     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
17157   } else {
17158     // Need to build a vector containing shift amount.
17159     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
          // i32 amount -> v4i32 {ShAmt, 0, undef, undef};
          // i64 amount -> v2i64 {ShAmt, undef}.
17160     SmallVector<SDValue, 4> ShOps;
17161     ShOps.push_back(ShAmt);
17162     if (SVT == MVT::i32) {
17163       ShOps.push_back(DAG.getConstant(0, dl, SVT));
17164       ShOps.push_back(DAG.getUNDEF(SVT));
17165     }
17166     ShOps.push_back(DAG.getUNDEF(SVT));
17167
17168     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17169     ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
17170   }
17171
17172   // The return type has to be a 128-bit type with the same element
17173   // type as the input type.
17174   MVT EltVT = VT.getVectorElementType();
17175   MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17176
17177   ShAmt = DAG.getBitcast(ShVT, ShAmt);
17178   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17179 }
17180
17181 /// \brief Return Mask with the necessary casting or extending
17182 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
      ///
      /// Handling, in order:
      ///  - an all-ones or zero \p Mask becomes a target constant 1 or 0 of type
      ///    \p MaskVT;
      ///  - a \p Mask narrower than \p MaskVT is any-extended to an integer with
      ///    as many bits as \p MaskVT;
      ///  - an i64 mask on a 32-bit subtarget is either split into two v32i1
      ///    halves and concatenated (v64i1) or truncated and bitcast;
      ///  - anything else is bitcast to a vector of i1 and the low \p MaskVT
      ///    elements are extracted.
17183 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
17184                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
17185                            const SDLoc &dl) {
17186
17187   if (isAllOnesConstant(Mask))
17188     return DAG.getTargetConstant(1, dl, MaskVT);
17189   if (X86::isZeroNode(Mask))
17190     return DAG.getTargetConstant(0, dl, MaskVT);
17191
17192   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
17193     // Mask should be extended
17194     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
17195                        MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
17196   }
17197
17198   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
17199     if (MaskVT == MVT::v64i1) {
17200       assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
17201       // In case 32bit mode, bitcast i64 is illegal, extend/split it.
            // Element 0 of the i64 becomes Lo and element 1 becomes Hi.
17202       SDValue Lo, Hi;
17203       Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17204                           DAG.getConstant(0, dl, MVT::i32));
17205       Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17206                           DAG.getConstant(1, dl, MVT::i32));
17207
17208       Lo = DAG.getBitcast(MVT::v32i1, Lo);
17209       Hi = DAG.getBitcast(MVT::v32i1, Hi);
17210
17211       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
17212     } else {
17213       // MaskVT require < 64bit. Truncate mask (should succeed in any case),
17214       // and bitcast.
17215       MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
17216       return DAG.getBitcast(MaskVT,
17217                             DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
17218     }
17219
17220   } else {
          // BitcastVT has one i1 element per bit of the (possibly extended) mask.
17221     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17222                                      Mask.getSimpleValueType().getSizeInBits());
17223     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17224     // are extracted by EXTRACT_SUBVECTOR.
17225     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17226                        DAG.getBitcast(BitcastVT, Mask),
17227                        DAG.getIntPtrConstant(0, dl));
17228   }
17229 }
17230
17231 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17232 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17233 /// necessary casting or extending for \p Mask when lowering masking intrinsics
      ///
      /// Additional cases visible in the body:
      ///  - an all-ones \p Mask returns \p Op unchanged;
      ///  - VFPCLASS/VFPCLASSS results are OR-ed with the converted mask;
      ///  - VTRUNC/VTRUNCS/VTRUNCUS and FP_TO_FP16 use X86ISD::SELECT instead of
      ///    ISD::VSELECT;
      ///  - an undef \p PreservedSrc is replaced by a zero vector of \p Op's type.
17234 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17235                   SDValue PreservedSrc,
17236                   const X86Subtarget &Subtarget,
17237                   SelectionDAG &DAG) {
17238   MVT VT = Op.getSimpleValueType();
        // One i1 mask element per element of Op's vector type.
17239   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17240   unsigned OpcodeSelect = ISD::VSELECT;
17241   SDLoc dl(Op);
17242
17243   if (isAllOnesConstant(Mask))
17244     return Op;
17245
17246   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17247
17248   switch (Op.getOpcode()) {
17249   default: break;
17250   case X86ISD::PCMPEQM:
17251   case X86ISD::PCMPGTM:
17252   case X86ISD::CMPM:
17253   case X86ISD::CMPMU:
17254     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17255   case X86ISD::VFPCLASS:
17256     case X86ISD::VFPCLASSS:
17257     return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
17258   case X86ISD::VTRUNC:
17259   case X86ISD::VTRUNCS:
17260   case X86ISD::VTRUNCUS:
17261   case ISD::FP_TO_FP16:
17262     // We can't use ISD::VSELECT here because it is not always "Legal"
17263     // for the destination type. For example vpmovqb require only AVX512
17264     // and vselect that can operate on byte element type require BWI
17265     OpcodeSelect = X86ISD::SELECT;
17266     break;
17267   }
17268   if (PreservedSrc.isUndef())
17269     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17270   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
17271 }
17272
17273 /// \brief Creates an SDNode for a predicated scalar operation.
17274 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17275 /// The mask is coming as MVT::i8 and it should be truncated
17276 /// to MVT::i1 while lowering masking intrinsics.
17277 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17278 /// "X86select" instead of "vselect". We just can't create the "vselect" node
17279 /// for a scalar instruction.
      ///
      /// Special cases visible in the body: an all-ones \p Mask returns \p Op
      /// unchanged, an X86ISD::FSETCC result is AND-ed with the truncated mask,
      /// VFPCLASS/VFPCLASSS results are OR-ed with it, and an undef
      /// \p PreservedSrc is replaced by a zero vector of \p Op's type.
17280 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17281                                     SDValue PreservedSrc,
17282                                     const X86Subtarget &Subtarget,
17283                                     SelectionDAG &DAG) {
17284   if (isAllOnesConstant(Mask))
17285     return Op;
17286
17287   MVT VT = Op.getSimpleValueType();
17288   SDLoc dl(Op);
17289   // The mask should be of type MVT::i1
17290   SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17291
17292   if (Op.getOpcode() == X86ISD::FSETCC)
17293     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
17294   if (Op.getOpcode() == X86ISD::VFPCLASS ||
17295       Op.getOpcode() == X86ISD::VFPCLASSS)
17296     return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
17297
17298   if (PreservedSrc.isUndef())
17299     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17300   return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17301 }
17302
17303 static int getSEHRegistrationNodeSize(const Function *Fn) {
        // Returns the size in bytes of the EH registration node for Fn, chosen by
        // the function's personality: 24 for MSVC_X86SEH and 16 for MSVC_CXX.
        // A function without a personality, or with any other personality, is a
        // fatal error.
17304   if (!Fn->hasPersonalityFn())
17305     report_fatal_error(
17306         "querying registration node size for function without personality");
17307   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
17308   // WinEHStatePass for the full struct definition.
17309   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
17310   case EHPersonality::MSVC_X86SEH: return 24;
17311   case EHPersonality::MSVC_CXX: return 16;
17312   default: break;
17313   }
17314   report_fatal_error(
17315       "can only recover FP for 32-bit MSVC EH personality functions");
17316 }
17317
17318 /// When the MSVC runtime transfers control to us, either to an outlined
17319 /// function or when returning to a parent frame after catching an exception, we
17320 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
17321 /// Here's the math:
17322 ///   RegNodeBase = EntryEBP - RegNodeSize
17323 ///   ParentFP = RegNodeBase - ParentFrameOffset
17324 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
17325 /// subtracting the offset (negative on x86) takes us back to the parent FP.
17326 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
17327                                    SDValue EntryEBP) {
17328   MachineFunction &MF = DAG.getMachineFunction();
17329   SDLoc dl;
17330
17331   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17332   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
17333
17334   // It's possible that the parent function no longer has a personality function
17335   // if the exceptional code was optimized away, in which case we just return
17336   // the incoming EBP.
17337   if (!Fn->hasPersonalityFn())
17338     return EntryEBP;
17339
17340   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
17341   // registration, or the .set_setframe offset.
17342   MCSymbol *OffsetSym =
17343       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
17344           GlobalValue::getRealLinkageName(Fn->getName()));
17345   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
17346   SDValue ParentFrameOffset =
17347       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
17348
17349   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
17350   // prologue to RBP in the parent function.
17351   const X86Subtarget &Subtarget =
17352       static_cast<const X86Subtarget &>(DAG.getSubtarget());
17353   if (Subtarget.is64Bit())
17354     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
17355
17356   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
17357   // RegNodeBase = EntryEBP - RegNodeSize
17358   // ParentFP = RegNodeBase - ParentFrameOffset
17359   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
17360                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
17361   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
17362 }
17363
17364 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
17365                                        SelectionDAG &DAG) {
17366   SDLoc dl(Op);
17367   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17368   MVT VT = Op.getSimpleValueType();
17369   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17370   if (IntrData) {
17371     switch(IntrData->Type) {
17372     case INTR_TYPE_1OP:
17373       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17374     case INTR_TYPE_2OP:
17375       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17376         Op.getOperand(2));
17377     case INTR_TYPE_2OP_IMM8:
17378       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17379                          DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
17380     case INTR_TYPE_3OP:
17381       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17382         Op.getOperand(2), Op.getOperand(3));
17383     case INTR_TYPE_4OP:
17384       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17385         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
17386     case INTR_TYPE_1OP_MASK_RM: {
17387       SDValue Src = Op.getOperand(1);
17388       SDValue PassThru = Op.getOperand(2);
17389       SDValue Mask = Op.getOperand(3);
17390       SDValue RoundingMode;
17391       // We allways add rounding mode to the Node.
17392       // If the rounding mode is not specified, we add the
17393       // "current direction" mode.
17394       if (Op.getNumOperands() == 4)
17395         RoundingMode =
17396           DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17397       else
17398         RoundingMode = Op.getOperand(4);
17399       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17400       if (IntrWithRoundingModeOpcode != 0)
17401         if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
17402             X86::STATIC_ROUNDING::CUR_DIRECTION)
17403           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17404                                       dl, Op.getValueType(), Src, RoundingMode),
17405                                       Mask, PassThru, Subtarget, DAG);
17406       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17407                                               RoundingMode),
17408                                   Mask, PassThru, Subtarget, DAG);
17409     }
17410     case INTR_TYPE_1OP_MASK: {
17411       SDValue Src = Op.getOperand(1);
17412       SDValue PassThru = Op.getOperand(2);
17413       SDValue Mask = Op.getOperand(3);
17414       // We add rounding mode to the Node when
17415       //   - RM Opcode is specified and
17416       //   - RM is not "current direction".
17417       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17418       if (IntrWithRoundingModeOpcode != 0) {
17419         SDValue Rnd = Op.getOperand(4);
17420         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17421         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17422           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17423                                       dl, Op.getValueType(),
17424                                       Src, Rnd),
17425                                       Mask, PassThru, Subtarget, DAG);
17426         }
17427       }
17428       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17429                                   Mask, PassThru, Subtarget, DAG);
17430     }
17431     case INTR_TYPE_SCALAR_MASK: {
17432       SDValue Src1 = Op.getOperand(1);
17433       SDValue Src2 = Op.getOperand(2);
17434       SDValue passThru = Op.getOperand(3);
17435       SDValue Mask = Op.getOperand(4);
17436       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
17437                                   Mask, passThru, Subtarget, DAG);
17438     }
17439     case INTR_TYPE_SCALAR_MASK_RM: {
17440       SDValue Src1 = Op.getOperand(1);
17441       SDValue Src2 = Op.getOperand(2);
17442       SDValue Src0 = Op.getOperand(3);
17443       SDValue Mask = Op.getOperand(4);
17444       // There are 2 kinds of intrinsics in this group:
17445       // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
17446       // (2) With rounding mode and sae - 7 operands.
17447       if (Op.getNumOperands() == 6) {
17448         SDValue Sae  = Op.getOperand(5);
17449         unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
17450         return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
17451                                                 Sae),
17452                                     Mask, Src0, Subtarget, DAG);
17453       }
17454       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
17455       SDValue RoundingMode  = Op.getOperand(5);
17456       SDValue Sae  = Op.getOperand(6);
17457       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17458                                               RoundingMode, Sae),
17459                                   Mask, Src0, Subtarget, DAG);
17460     }
17461     case INTR_TYPE_2OP_MASK:
17462     case INTR_TYPE_2OP_IMM8_MASK: {
17463       SDValue Src1 = Op.getOperand(1);
17464       SDValue Src2 = Op.getOperand(2);
17465       SDValue PassThru = Op.getOperand(3);
17466       SDValue Mask = Op.getOperand(4);
17467
17468       if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
17469         Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
17470
17471       // We specify 2 possible opcodes for intrinsics with rounding modes.
17472       // First, we check if the intrinsic may have non-default rounding mode,
17473       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17474       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17475       if (IntrWithRoundingModeOpcode != 0) {
17476         SDValue Rnd = Op.getOperand(5);
17477         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17478         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17479           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17480                                       dl, Op.getValueType(),
17481                                       Src1, Src2, Rnd),
17482                                       Mask, PassThru, Subtarget, DAG);
17483         }
17484       }
17485       // TODO: Intrinsics should have fast-math-flags to propagate.
17486       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
17487                                   Mask, PassThru, Subtarget, DAG);
17488     }
17489     case INTR_TYPE_2OP_MASK_RM: {
17490       SDValue Src1 = Op.getOperand(1);
17491       SDValue Src2 = Op.getOperand(2);
17492       SDValue PassThru = Op.getOperand(3);
17493       SDValue Mask = Op.getOperand(4);
17494       // We specify 2 possible modes for intrinsics, with/without rounding
17495       // modes.
17496       // First, we check if the intrinsic have rounding mode (6 operands),
17497       // if not, we set rounding mode to "current".
17498       SDValue Rnd;
17499       if (Op.getNumOperands() == 6)
17500         Rnd = Op.getOperand(5);
17501       else
17502         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17503       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17504                                               Src1, Src2, Rnd),
17505                                   Mask, PassThru, Subtarget, DAG);
17506     }
17507     case INTR_TYPE_3OP_SCALAR_MASK_RM: {
17508       SDValue Src1 = Op.getOperand(1);
17509       SDValue Src2 = Op.getOperand(2);
17510       SDValue Src3 = Op.getOperand(3);
17511       SDValue PassThru = Op.getOperand(4);
17512       SDValue Mask = Op.getOperand(5);
17513       SDValue Sae  = Op.getOperand(6);
17514
17515       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
17516                                               Src2, Src3, Sae),
17517                                   Mask, PassThru, Subtarget, DAG);
17518     }
17519     case INTR_TYPE_3OP_MASK_RM: {
17520       SDValue Src1 = Op.getOperand(1);
17521       SDValue Src2 = Op.getOperand(2);
17522       SDValue Imm = Op.getOperand(3);
17523       SDValue PassThru = Op.getOperand(4);
17524       SDValue Mask = Op.getOperand(5);
17525       // We specify 2 possible modes for intrinsics, with/without rounding
17526       // modes.
17527       // First, we check if the intrinsic have rounding mode (7 operands),
17528       // if not, we set rounding mode to "current".
17529       SDValue Rnd;
17530       if (Op.getNumOperands() == 7)
17531         Rnd = Op.getOperand(6);
17532       else
17533         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17534       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17535         Src1, Src2, Imm, Rnd),
17536         Mask, PassThru, Subtarget, DAG);
17537     }
17538     case INTR_TYPE_3OP_IMM8_MASK:
17539     case INTR_TYPE_3OP_MASK:
17540     case INSERT_SUBVEC: {
17541       SDValue Src1 = Op.getOperand(1);
17542       SDValue Src2 = Op.getOperand(2);
17543       SDValue Src3 = Op.getOperand(3);
17544       SDValue PassThru = Op.getOperand(4);
17545       SDValue Mask = Op.getOperand(5);
17546
17547       if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
17548         Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
17549       else if (IntrData->Type == INSERT_SUBVEC) {
17550         // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
17551         assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
17552         unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
17553         Imm *= Src2.getSimpleValueType().getVectorNumElements();
17554         Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
17555       }
17556
17557       // We specify 2 possible opcodes for intrinsics with rounding modes.
17558       // First, we check if the intrinsic may have non-default rounding mode,
17559       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17560       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17561       if (IntrWithRoundingModeOpcode != 0) {
17562         SDValue Rnd = Op.getOperand(6);
17563         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17564         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17565           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17566                                       dl, Op.getValueType(),
17567                                       Src1, Src2, Src3, Rnd),
17568                                       Mask, PassThru, Subtarget, DAG);
17569         }
17570       }
17571       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17572                                               Src1, Src2, Src3),
17573                                   Mask, PassThru, Subtarget, DAG);
17574     }
17575     case VPERM_2OP_MASK : {
17576       SDValue Src1 = Op.getOperand(1);
17577       SDValue Src2 = Op.getOperand(2);
17578       SDValue PassThru = Op.getOperand(3);
17579       SDValue Mask = Op.getOperand(4);
17580
17581       // Swap Src1 and Src2 in the node creation
17582       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
17583                                   Mask, PassThru, Subtarget, DAG);
17584     }
17585     case VPERM_3OP_MASKZ:
17586     case VPERM_3OP_MASK:{
17587       // Src2 is the PassThru
17588       SDValue Src1 = Op.getOperand(1);
17589       SDValue Src2 = Op.getOperand(2);
17590       SDValue Src3 = Op.getOperand(3);
17591       SDValue Mask = Op.getOperand(4);
17592       MVT VT = Op.getSimpleValueType();
17593       SDValue PassThru = SDValue();
17594
17595       // set PassThru element
17596       if (IntrData->Type == VPERM_3OP_MASKZ)
17597         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17598       else
17599         PassThru = DAG.getBitcast(VT, Src2);
17600
17601       // Swap Src1 and Src2 in the node creation
17602       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17603                                               dl, Op.getValueType(),
17604                                               Src2, Src1, Src3),
17605                                   Mask, PassThru, Subtarget, DAG);
17606     }
17607     case FMA_OP_MASK3:
17608     case FMA_OP_MASKZ:
17609     case FMA_OP_MASK: {
17610       SDValue Src1 = Op.getOperand(1);
17611       SDValue Src2 = Op.getOperand(2);
17612       SDValue Src3 = Op.getOperand(3);
17613       SDValue Mask = Op.getOperand(4);
17614       MVT VT = Op.getSimpleValueType();
17615       SDValue PassThru = SDValue();
17616
17617       // set PassThru element
17618       if (IntrData->Type == FMA_OP_MASKZ)
17619         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17620       else if (IntrData->Type == FMA_OP_MASK3)
17621         PassThru = Src3;
17622       else
17623         PassThru = Src1;
17624
17625       // We specify 2 possible opcodes for intrinsics with rounding modes.
17626       // First, we check if the intrinsic may have non-default rounding mode,
17627       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17628       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17629       if (IntrWithRoundingModeOpcode != 0) {
17630         SDValue Rnd = Op.getOperand(5);
17631         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17632             X86::STATIC_ROUNDING::CUR_DIRECTION)
17633           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17634                                                   dl, Op.getValueType(),
17635                                                   Src1, Src2, Src3, Rnd),
17636                                       Mask, PassThru, Subtarget, DAG);
17637       }
17638       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17639                                               dl, Op.getValueType(),
17640                                               Src1, Src2, Src3),
17641                                   Mask, PassThru, Subtarget, DAG);
17642     }
17643     case FMA_OP_SCALAR_MASK:
17644     case FMA_OP_SCALAR_MASK3:
17645     case FMA_OP_SCALAR_MASKZ: {
17646       SDValue Src1 = Op.getOperand(1);
17647       SDValue Src2 = Op.getOperand(2);
17648       SDValue Src3 = Op.getOperand(3);
17649       SDValue Mask = Op.getOperand(4);
17650       MVT VT = Op.getSimpleValueType();
17651       SDValue PassThru = SDValue();
17652
17653       // set PassThru element
17654       if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
17655         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17656       else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
17657         PassThru = Src3;
17658       else
17659         PassThru = Src1;
17660
17661       SDValue Rnd = Op.getOperand(5);
17662       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
17663                                               Op.getValueType(), Src1, Src2,
17664                                               Src3, Rnd),
17665                                   Mask, PassThru, Subtarget, DAG);
17666     }
17667     case TERLOG_OP_MASK:
17668     case TERLOG_OP_MASKZ: {
17669       SDValue Src1 = Op.getOperand(1);
17670       SDValue Src2 = Op.getOperand(2);
17671       SDValue Src3 = Op.getOperand(3);
17672       SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
17673       SDValue Mask = Op.getOperand(5);
17674       MVT VT = Op.getSimpleValueType();
17675       SDValue PassThru = Src1;
17676       // Set PassThru element.
17677       if (IntrData->Type == TERLOG_OP_MASKZ)
17678         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17679
17680       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17681                                               Src1, Src2, Src3, Src4),
17682                                   Mask, PassThru, Subtarget, DAG);
17683     }
17684     case FPCLASS: {
17685       // FPclass intrinsics with mask
17686        SDValue Src1 = Op.getOperand(1);
17687        MVT VT = Src1.getSimpleValueType();
17688        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17689        SDValue Imm = Op.getOperand(2);
17690        SDValue Mask = Op.getOperand(3);
17691        MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17692                                      Mask.getSimpleValueType().getSizeInBits());
17693        SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
17694        SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
17695                                                  DAG.getTargetConstant(0, dl, MaskVT),
17696                                                  Subtarget, DAG);
17697        SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17698                                  DAG.getUNDEF(BitcastVT), FPclassMask,
17699                                  DAG.getIntPtrConstant(0, dl));
17700        return DAG.getBitcast(Op.getValueType(), Res);
17701     }
17702     case FPCLASSS: {
17703       SDValue Src1 = Op.getOperand(1);
17704       SDValue Imm = Op.getOperand(2);
17705       SDValue Mask = Op.getOperand(3);
17706       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
17707       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
17708         DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
17709       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
17710     }
17711     case CMP_MASK:
17712     case CMP_MASK_CC: {
17713       // Comparison intrinsics with masks.
17714       // Example of transformation:
17715       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17716       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17717       // (i8 (bitcast
17718       //   (v8i1 (insert_subvector undef,
17719       //           (v2i1 (and (PCMPEQM %a, %b),
17720       //                      (extract_subvector
17721       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17722       MVT VT = Op.getOperand(1).getSimpleValueType();
17723       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17724       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17725       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17726                                        Mask.getSimpleValueType().getSizeInBits());
17727       SDValue Cmp;
17728       if (IntrData->Type == CMP_MASK_CC) {
17729         SDValue CC = Op.getOperand(3);
17730         CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
17731         // We specify 2 possible opcodes for intrinsics with rounding modes.
17732         // First, we check if the intrinsic may have non-default rounding mode,
17733         // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17734         if (IntrData->Opc1 != 0) {
17735           SDValue Rnd = Op.getOperand(5);
17736           if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17737               X86::STATIC_ROUNDING::CUR_DIRECTION)
17738             Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
17739                               Op.getOperand(2), CC, Rnd);
17740         }
17741         //default rounding mode
17742         if(!Cmp.getNode())
17743             Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17744                               Op.getOperand(2), CC);
17745
17746       } else {
17747         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17748         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17749                           Op.getOperand(2));
17750       }
17751       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17752                                              DAG.getTargetConstant(0, dl,
17753                                                                    MaskVT),
17754                                              Subtarget, DAG);
17755       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17756                                 DAG.getUNDEF(BitcastVT), CmpMask,
17757                                 DAG.getIntPtrConstant(0, dl));
17758       return DAG.getBitcast(Op.getValueType(), Res);
17759     }
17760     case CMP_MASK_SCALAR_CC: {
17761       SDValue Src1 = Op.getOperand(1);
17762       SDValue Src2 = Op.getOperand(2);
17763       SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
17764       SDValue Mask = Op.getOperand(4);
17765
17766       SDValue Cmp;
17767       if (IntrData->Opc1 != 0) {
17768         SDValue Rnd = Op.getOperand(5);
17769         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17770             X86::STATIC_ROUNDING::CUR_DIRECTION)
17771           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
17772       }
17773       //default rounding mode
17774       if(!Cmp.getNode())
17775         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
17776
17777       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
17778                                              DAG.getTargetConstant(0, dl,
17779                                                                    MVT::i1),
17780                                              Subtarget, DAG);
17781
17782       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
17783     }
17784     case COMI: { // Comparison intrinsics
17785       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17786       SDValue LHS = Op.getOperand(1);
17787       SDValue RHS = Op.getOperand(2);
17788       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17789       SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
17790       SDValue SetCC;
17791       switch (CC) {
17792       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
17793         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17794                             DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
17795         SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17796                                     DAG.getConstant(X86::COND_NP, dl, MVT::i8),
17797                                     Comi);
17798         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
17799         break;
17800       }
17801       case ISD::SETNE: { // (ZF = 1 or PF = 1)
17802         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17803                             DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
17804         SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17805                                    DAG.getConstant(X86::COND_P, dl, MVT::i8),
17806                                    Comi);
17807         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
17808         break;
17809       }
17810       case ISD::SETGT: // (CF = 0 and ZF = 0)
17811         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17812                             DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
17813         break;
17814       case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
17815         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17816                             DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
17817         break;
17818       }
17819       case ISD::SETGE: // CF = 0
17820         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17821                             DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
17822         break;
17823       case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
17824         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17825                             DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
17826         break;
17827       default:
17828         llvm_unreachable("Unexpected illegal condition!");
17829       }
17830       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17831     }
17832     case COMI_RM: { // Comparison intrinsics with Sae
17833       SDValue LHS = Op.getOperand(1);
17834       SDValue RHS = Op.getOperand(2);
17835       unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
17836       SDValue Sae = Op.getOperand(4);
17837
17838       SDValue FCmp;
17839       if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
17840           X86::STATIC_ROUNDING::CUR_DIRECTION)
17841         FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17842                                   DAG.getConstant(CondVal, dl, MVT::i8));
17843       else
17844         FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17845                                   DAG.getConstant(CondVal, dl, MVT::i8), Sae);
17846       // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
17847       return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
17848     }
17849     case VSHIFT:
17850       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17851                                  Op.getOperand(1), Op.getOperand(2), DAG);
17852     case COMPRESS_EXPAND_IN_REG: {
17853       SDValue Mask = Op.getOperand(3);
17854       SDValue DataToCompress = Op.getOperand(1);
17855       SDValue PassThru = Op.getOperand(2);
17856       if (isAllOnesConstant(Mask)) // return data as is
17857         return Op.getOperand(1);
17858
17859       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17860                                               DataToCompress),
17861                                   Mask, PassThru, Subtarget, DAG);
17862     }
17863     case BROADCASTM: {
17864       SDValue Mask = Op.getOperand(1);
17865       MVT MaskVT = MVT::getVectorVT(MVT::i1,
17866                                     Mask.getSimpleValueType().getSizeInBits());
17867       Mask = DAG.getBitcast(MaskVT, Mask);
17868       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
17869     }
17870     case KUNPCK: {
17871       MVT VT = Op.getSimpleValueType();
17872       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
17873
17874       SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
17875       SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
17876       // Arguments should be swapped.
17877       SDValue Res = DAG.getNode(IntrData->Opc0, dl,
17878                                 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
17879                                 Src2, Src1);
17880       return DAG.getBitcast(VT, Res);
17881     }
17882     case FIXUPIMMS:
17883     case FIXUPIMMS_MASKZ:
17884     case FIXUPIMM:
17885     case FIXUPIMM_MASKZ:{
17886       SDValue Src1 = Op.getOperand(1);
17887       SDValue Src2 = Op.getOperand(2);
17888       SDValue Src3 = Op.getOperand(3);
17889       SDValue Imm = Op.getOperand(4);
17890       SDValue Mask = Op.getOperand(5);
17891       SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
17892                                          Src1 : getZeroVector(VT, Subtarget, DAG, dl);
17893       // We specify 2 possible modes for intrinsics, with/without rounding
17894       // modes.
17895       // First, we check if the intrinsic have rounding mode (7 operands),
17896       // if not, we set rounding mode to "current".
17897       SDValue Rnd;
17898       if (Op.getNumOperands() == 7)
17899         Rnd = Op.getOperand(6);
17900       else
17901         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17902       if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
17903         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17904                                                 Src1, Src2, Src3, Imm, Rnd),
17905                                     Mask, Passthru, Subtarget, DAG);
17906       else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
17907         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17908                                        Src1, Src2, Src3, Imm, Rnd),
17909                                     Mask, Passthru, Subtarget, DAG);
17910     }
17911     case CONVERT_TO_MASK: {
17912       MVT SrcVT = Op.getOperand(1).getSimpleValueType();
17913       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
17914       MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
17915
17916       SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
17917                                     Op.getOperand(1));
17918       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17919                                 DAG.getUNDEF(BitcastVT), CvtMask,
17920                                 DAG.getIntPtrConstant(0, dl));
17921       return DAG.getBitcast(Op.getValueType(), Res);
17922     }
17923     case CONVERT_MASK_TO_VEC: {
17924       SDValue Mask = Op.getOperand(1);
17925       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17926       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17927       return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
17928     }
17929     case BRCST_SUBVEC_TO_VEC: {
17930       SDValue Src = Op.getOperand(1);
17931       SDValue Passthru = Op.getOperand(2);
17932       SDValue Mask = Op.getOperand(3);
17933       EVT resVT = Passthru.getValueType();
17934       SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
17935                                        DAG.getUNDEF(resVT), Src,
17936                                        DAG.getIntPtrConstant(0, dl));
17937       SDValue immVal;
17938       if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
17939         immVal = DAG.getConstant(0x44, dl, MVT::i8);
17940       else
17941         immVal = DAG.getConstant(0, dl, MVT::i8);
17942       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17943                                               subVec, subVec, immVal),
17944                                   Mask, Passthru, Subtarget, DAG);
17945     }
17946     case BRCST32x2_TO_VEC: {
17947       SDValue Src = Op.getOperand(1);
17948       SDValue PassThru = Op.getOperand(2);
17949       SDValue Mask = Op.getOperand(3);
17950
17951       assert((VT.getScalarType() == MVT::i32 ||
17952               VT.getScalarType() == MVT::f32) && "Unexpected type!");
17953       //bitcast Src to packed 64
17954       MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
17955       MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
17956       Src = DAG.getBitcast(BitcastVT, Src);
17957
17958       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17959                                   Mask, PassThru, Subtarget, DAG);
17960     }
17961     default:
17962       break;
17963     }
17964   }
17965
17966   switch (IntNo) {
17967   default: return SDValue();    // Don't custom lower most intrinsics.
17968
17969   case Intrinsic::x86_avx2_permd:
17970   case Intrinsic::x86_avx2_permps:
17971     // Operands intentionally swapped. Mask is last operand to intrinsic,
17972     // but second operand for node/instruction.
17973     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
17974                        Op.getOperand(2), Op.getOperand(1));
17975
17976   // ptest and testp intrinsics. The intrinsic these come from are designed to
17977   // return an integer value, not just an instruction so lower it to the ptest
17978   // or testp pattern and a setcc for the result.
17979   case Intrinsic::x86_sse41_ptestz:
17980   case Intrinsic::x86_sse41_ptestc:
17981   case Intrinsic::x86_sse41_ptestnzc:
17982   case Intrinsic::x86_avx_ptestz_256:
17983   case Intrinsic::x86_avx_ptestc_256:
17984   case Intrinsic::x86_avx_ptestnzc_256:
17985   case Intrinsic::x86_avx_vtestz_ps:
17986   case Intrinsic::x86_avx_vtestc_ps:
17987   case Intrinsic::x86_avx_vtestnzc_ps:
17988   case Intrinsic::x86_avx_vtestz_pd:
17989   case Intrinsic::x86_avx_vtestc_pd:
17990   case Intrinsic::x86_avx_vtestnzc_pd:
17991   case Intrinsic::x86_avx_vtestz_ps_256:
17992   case Intrinsic::x86_avx_vtestc_ps_256:
17993   case Intrinsic::x86_avx_vtestnzc_ps_256:
17994   case Intrinsic::x86_avx_vtestz_pd_256:
17995   case Intrinsic::x86_avx_vtestc_pd_256:
17996   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17997     bool IsTestPacked = false;
17998     unsigned X86CC;
17999     switch (IntNo) {
18000     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
18001     case Intrinsic::x86_avx_vtestz_ps:
18002     case Intrinsic::x86_avx_vtestz_pd:
18003     case Intrinsic::x86_avx_vtestz_ps_256:
18004     case Intrinsic::x86_avx_vtestz_pd_256:
18005       IsTestPacked = true; // Fallthrough
18006     case Intrinsic::x86_sse41_ptestz:
18007     case Intrinsic::x86_avx_ptestz_256:
18008       // ZF = 1
18009       X86CC = X86::COND_E;
18010       break;
18011     case Intrinsic::x86_avx_vtestc_ps:
18012     case Intrinsic::x86_avx_vtestc_pd:
18013     case Intrinsic::x86_avx_vtestc_ps_256:
18014     case Intrinsic::x86_avx_vtestc_pd_256:
18015       IsTestPacked = true; // Fallthrough
18016     case Intrinsic::x86_sse41_ptestc:
18017     case Intrinsic::x86_avx_ptestc_256:
18018       // CF = 1
18019       X86CC = X86::COND_B;
18020       break;
18021     case Intrinsic::x86_avx_vtestnzc_ps:
18022     case Intrinsic::x86_avx_vtestnzc_pd:
18023     case Intrinsic::x86_avx_vtestnzc_ps_256:
18024     case Intrinsic::x86_avx_vtestnzc_pd_256:
18025       IsTestPacked = true; // Fallthrough
18026     case Intrinsic::x86_sse41_ptestnzc:
18027     case Intrinsic::x86_avx_ptestnzc_256:
18028       // ZF and CF = 0
18029       X86CC = X86::COND_A;
18030       break;
18031     }
18032
18033     SDValue LHS = Op.getOperand(1);
18034     SDValue RHS = Op.getOperand(2);
18035     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
18036     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
18037     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18038     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18039     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18040   }
18041   case Intrinsic::x86_avx512_kortestz_w:
18042   case Intrinsic::x86_avx512_kortestc_w: {
18043     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
18044     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
18045     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
18046     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18047     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18048     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18049     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18050   }
18051
18052   case Intrinsic::x86_sse42_pcmpistria128:
18053   case Intrinsic::x86_sse42_pcmpestria128:
18054   case Intrinsic::x86_sse42_pcmpistric128:
18055   case Intrinsic::x86_sse42_pcmpestric128:
18056   case Intrinsic::x86_sse42_pcmpistrio128:
18057   case Intrinsic::x86_sse42_pcmpestrio128:
18058   case Intrinsic::x86_sse42_pcmpistris128:
18059   case Intrinsic::x86_sse42_pcmpestris128:
18060   case Intrinsic::x86_sse42_pcmpistriz128:
18061   case Intrinsic::x86_sse42_pcmpestriz128: {
18062     unsigned Opcode;
18063     unsigned X86CC;
18064     switch (IntNo) {
18065     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
18066     case Intrinsic::x86_sse42_pcmpistria128:
18067       Opcode = X86ISD::PCMPISTRI;
18068       X86CC = X86::COND_A;
18069       break;
18070     case Intrinsic::x86_sse42_pcmpestria128:
18071       Opcode = X86ISD::PCMPESTRI;
18072       X86CC = X86::COND_A;
18073       break;
18074     case Intrinsic::x86_sse42_pcmpistric128:
18075       Opcode = X86ISD::PCMPISTRI;
18076       X86CC = X86::COND_B;
18077       break;
18078     case Intrinsic::x86_sse42_pcmpestric128:
18079       Opcode = X86ISD::PCMPESTRI;
18080       X86CC = X86::COND_B;
18081       break;
18082     case Intrinsic::x86_sse42_pcmpistrio128:
18083       Opcode = X86ISD::PCMPISTRI;
18084       X86CC = X86::COND_O;
18085       break;
18086     case Intrinsic::x86_sse42_pcmpestrio128:
18087       Opcode = X86ISD::PCMPESTRI;
18088       X86CC = X86::COND_O;
18089       break;
18090     case Intrinsic::x86_sse42_pcmpistris128:
18091       Opcode = X86ISD::PCMPISTRI;
18092       X86CC = X86::COND_S;
18093       break;
18094     case Intrinsic::x86_sse42_pcmpestris128:
18095       Opcode = X86ISD::PCMPESTRI;
18096       X86CC = X86::COND_S;
18097       break;
18098     case Intrinsic::x86_sse42_pcmpistriz128:
18099       Opcode = X86ISD::PCMPISTRI;
18100       X86CC = X86::COND_E;
18101       break;
18102     case Intrinsic::x86_sse42_pcmpestriz128:
18103       Opcode = X86ISD::PCMPESTRI;
18104       X86CC = X86::COND_E;
18105       break;
18106     }
18107     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18108     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18109     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
18110     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18111                                 DAG.getConstant(X86CC, dl, MVT::i8),
18112                                 SDValue(PCMP.getNode(), 1));
18113     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18114   }
18115
18116   case Intrinsic::x86_sse42_pcmpistri128:
18117   case Intrinsic::x86_sse42_pcmpestri128: {
18118     unsigned Opcode;
18119     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
18120       Opcode = X86ISD::PCMPISTRI;
18121     else
18122       Opcode = X86ISD::PCMPESTRI;
18123
18124     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18125     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18126     return DAG.getNode(Opcode, dl, VTs, NewOps);
18127   }
18128
18129   case Intrinsic::eh_sjlj_lsda: {
18130     MachineFunction &MF = DAG.getMachineFunction();
18131     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18132     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18133     auto &Context = MF.getMMI().getContext();
18134     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
18135                                             Twine(MF.getFunctionNumber()));
18136     return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
18137   }
18138
18139   case Intrinsic::x86_seh_lsda: {
18140     // Compute the symbol for the LSDA. We know it'll get emitted later.
18141     MachineFunction &MF = DAG.getMachineFunction();
18142     SDValue Op1 = Op.getOperand(1);
18143     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
18144     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
18145         GlobalValue::getRealLinkageName(Fn->getName()));
18146
18147     // Generate a simple absolute symbol reference. This intrinsic is only
18148     // supported on 32-bit Windows, which isn't PIC.
18149     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
18150     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
18151   }
18152
18153   case Intrinsic::x86_seh_recoverfp: {
18154     SDValue FnOp = Op.getOperand(1);
18155     SDValue IncomingFPOp = Op.getOperand(2);
18156     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
18157     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
18158     if (!Fn)
18159       report_fatal_error(
18160           "llvm.x86.seh.recoverfp must take a function as the first argument");
18161     return recoverFramePointer(DAG, Fn, IncomingFPOp);
18162   }
18163
18164   case Intrinsic::localaddress: {
18165     // Returns one of the stack, base, or frame pointer registers, depending on
18166     // which is used to reference local variables.
18167     MachineFunction &MF = DAG.getMachineFunction();
18168     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18169     unsigned Reg;
18170     if (RegInfo->hasBasePointer(MF))
18171       Reg = RegInfo->getBaseRegister();
18172     else // This function handles the SP or FP case.
18173       Reg = RegInfo->getPtrSizedFrameRegister(MF);
18174     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
18175   }
18176   }
18177 }
18178
18179 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18180                               SDValue Src, SDValue Mask, SDValue Base,
18181                               SDValue Index, SDValue ScaleOp, SDValue Chain,
18182                               const X86Subtarget &Subtarget) {
18183   SDLoc dl(Op);
18184   auto *C = cast<ConstantSDNode>(ScaleOp);
18185   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18186   MVT MaskVT = MVT::getVectorVT(MVT::i1,
18187                              Index.getSimpleValueType().getVectorNumElements());
18188
18189   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18190   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
18191   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18192   SDValue Segment = DAG.getRegister(0, MVT::i32);
18193   if (Src.isUndef())
18194     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
18195   SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
18196   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18197   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
18198   return DAG.getMergeValues(RetOps, dl);
18199 }
18200
18201 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18202                                SDValue Src, SDValue Mask, SDValue Base,
18203                                SDValue Index, SDValue ScaleOp, SDValue Chain,
18204                                const X86Subtarget &Subtarget) {
18205   SDLoc dl(Op);
18206   auto *C = cast<ConstantSDNode>(ScaleOp);
18207   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18208   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18209   SDValue Segment = DAG.getRegister(0, MVT::i32);
18210   MVT MaskVT = MVT::getVectorVT(MVT::i1,
18211                              Index.getSimpleValueType().getVectorNumElements());
18212
18213   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18214   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
18215   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
18216   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18217   return SDValue(Res, 1);
18218 }
18219
18220 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18221                                SDValue Mask, SDValue Base, SDValue Index,
18222                                SDValue ScaleOp, SDValue Chain,
18223                                const X86Subtarget &Subtarget) {
18224   SDLoc dl(Op);
18225   auto *C = cast<ConstantSDNode>(ScaleOp);
18226   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18227   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18228   SDValue Segment = DAG.getRegister(0, MVT::i32);
18229   MVT MaskVT =
18230     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
18231   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18232   //SDVTList VTs = DAG.getVTList(MVT::Other);
18233   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
18234   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
18235   return SDValue(Res, 0);
18236 }
18237
18238 /// Handles the lowering of builtin intrinsics that read performance monitor
18239 /// counters (x86_rdpmc).
18240 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
18241                                       SelectionDAG &DAG,
18242                                       const X86Subtarget &Subtarget,
18243                                       SmallVectorImpl<SDValue> &Results) {
18244   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18245   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18246   SDValue LO, HI;
18247
18248   // The ECX register is used to select the index of the performance counter
18249   // to read.
18250   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
18251                                    N->getOperand(2));
18252   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
18253
18254   // Reads the content of a 64-bit performance counter and returns it in the
18255   // registers EDX:EAX.
18256   if (Subtarget.is64Bit()) {
18257     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18258     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18259                             LO.getValue(2));
18260   } else {
18261     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18262     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18263                             LO.getValue(2));
18264   }
18265   Chain = HI.getValue(1);
18266
18267   if (Subtarget.is64Bit()) {
18268     // The EAX register is loaded with the low-order 32 bits. The EDX register
18269     // is loaded with the supported high-order bits of the counter.
18270     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18271                               DAG.getConstant(32, DL, MVT::i8));
18272     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18273     Results.push_back(Chain);
18274     return;
18275   }
18276
18277   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18278   SDValue Ops[] = { LO, HI };
18279   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18280   Results.push_back(Pair);
18281   Results.push_back(Chain);
18282 }
18283
18284 /// Handles the lowering of builtin intrinsics that read the time stamp counter
18285 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
18286 /// READCYCLECOUNTER nodes.
18287 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
18288                                     SelectionDAG &DAG,
18289                                     const X86Subtarget &Subtarget,
18290                                     SmallVectorImpl<SDValue> &Results) {
18291   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18292   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
18293   SDValue LO, HI;
18294
18295   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
18296   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
18297   // and the EAX register is loaded with the low-order 32 bits.
18298   if (Subtarget.is64Bit()) {
18299     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18300     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18301                             LO.getValue(2));
18302   } else {
18303     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18304     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18305                             LO.getValue(2));
18306   }
18307   SDValue Chain = HI.getValue(1);
18308
18309   if (Opcode == X86ISD::RDTSCP_DAG) {
18310     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18311
18312     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
18313     // the ECX register. Add 'ecx' explicitly to the chain.
18314     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
18315                                      HI.getValue(2));
18316     // Explicitly store the content of ECX at the location passed in input
18317     // to the 'rdtscp' intrinsic.
18318     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
18319                          MachinePointerInfo());
18320   }
18321
18322   if (Subtarget.is64Bit()) {
18323     // The EDX register is loaded with the high-order 32 bits of the MSR, and
18324     // the EAX register is loaded with the low-order 32 bits.
18325     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18326                               DAG.getConstant(32, DL, MVT::i8));
18327     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18328     Results.push_back(Chain);
18329     return;
18330   }
18331
18332   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18333   SDValue Ops[] = { LO, HI };
18334   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18335   Results.push_back(Pair);
18336   Results.push_back(Chain);
18337 }
18338
18339 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
18340                                      SelectionDAG &DAG) {
18341   SmallVector<SDValue, 2> Results;
18342   SDLoc DL(Op);
18343   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
18344                           Results);
18345   return DAG.getMergeValues(Results, DL);
18346 }
18347
18348 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
18349   MachineFunction &MF = DAG.getMachineFunction();
18350   SDValue Chain = Op.getOperand(0);
18351   SDValue RegNode = Op.getOperand(2);
18352   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18353   if (!EHInfo)
18354     report_fatal_error("EH registrations only live in functions using WinEH");
18355
18356   // Cast the operand to an alloca, and remember the frame index.
18357   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
18358   if (!FINode)
18359     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
18360   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
18361
18362   // Return the chain operand without making any DAG nodes.
18363   return Chain;
18364 }
18365
18366 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
18367   MachineFunction &MF = DAG.getMachineFunction();
18368   SDValue Chain = Op.getOperand(0);
18369   SDValue EHGuard = Op.getOperand(2);
18370   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18371   if (!EHInfo)
18372     report_fatal_error("EHGuard only live in functions using WinEH");
18373
18374   // Cast the operand to an alloca, and remember the frame index.
18375   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
18376   if (!FINode)
18377     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
18378   EHInfo->EHGuardFrameIndex = FINode->getIndex();
18379
18380   // Return the chain operand without making any DAG nodes.
18381   return Chain;
18382 }
18383
18384 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18385                                       SelectionDAG &DAG) {
18386   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
18387
18388   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
18389   if (!IntrData) {
18390     if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
18391       return MarkEHRegistrationNode(Op, DAG);
18392     if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
18393       return MarkEHGuard(Op, DAG);
18394     if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
18395         IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
18396         IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
18397         IntNo == llvm::Intrinsic::x86_flags_write_u64) {
18398       // We need a frame pointer because this will get lowered to a PUSH/POP
18399       // sequence.
18400       MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18401       MFI->setHasCopyImplyingStackAdjustment(true);
18402       // Don't do anything here, we will expand these intrinsics out later
18403       // during ExpandISelPseudos in EmitInstrWithCustomInserter.
18404       return SDValue();
18405     }
18406     return SDValue();
18407   }
18408
18409   SDLoc dl(Op);
18410   switch(IntrData->Type) {
18411   default: llvm_unreachable("Unknown Intrinsic Type");
18412   case RDSEED:
18413   case RDRAND: {
18414     // Emit the node with the right value type.
18415     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
18416     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18417
18418     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
18419     // Otherwise return the value from Rand, which is always 0, casted to i32.
18420     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
18421                       DAG.getConstant(1, dl, Op->getValueType(1)),
18422                       DAG.getConstant(X86::COND_B, dl, MVT::i32),
18423                       SDValue(Result.getNode(), 1) };
18424     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
18425                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
18426                                   Ops);
18427
18428     // Return { result, isValid, chain }.
18429     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
18430                        SDValue(Result.getNode(), 2));
18431   }
18432   case GATHER: {
18433   //gather(v1, mask, index, base, scale);
18434     SDValue Chain = Op.getOperand(0);
18435     SDValue Src   = Op.getOperand(2);
18436     SDValue Base  = Op.getOperand(3);
18437     SDValue Index = Op.getOperand(4);
18438     SDValue Mask  = Op.getOperand(5);
18439     SDValue Scale = Op.getOperand(6);
18440     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
18441                          Chain, Subtarget);
18442   }
18443   case SCATTER: {
18444   //scatter(base, mask, index, v1, scale);
18445     SDValue Chain = Op.getOperand(0);
18446     SDValue Base  = Op.getOperand(2);
18447     SDValue Mask  = Op.getOperand(3);
18448     SDValue Index = Op.getOperand(4);
18449     SDValue Src   = Op.getOperand(5);
18450     SDValue Scale = Op.getOperand(6);
18451     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
18452                           Scale, Chain, Subtarget);
18453   }
18454   case PREFETCH: {
18455     SDValue Hint = Op.getOperand(6);
18456     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
18457     assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
18458     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
18459     SDValue Chain = Op.getOperand(0);
18460     SDValue Mask  = Op.getOperand(2);
18461     SDValue Index = Op.getOperand(3);
18462     SDValue Base  = Op.getOperand(4);
18463     SDValue Scale = Op.getOperand(5);
18464     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
18465                            Subtarget);
18466   }
18467   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
18468   case RDTSC: {
18469     SmallVector<SDValue, 2> Results;
18470     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
18471                             Results);
18472     return DAG.getMergeValues(Results, dl);
18473   }
18474   // Read Performance Monitoring Counters.
18475   case RDPMC: {
18476     SmallVector<SDValue, 2> Results;
18477     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
18478     return DAG.getMergeValues(Results, dl);
18479   }
18480   // XTEST intrinsics.
18481   case XTEST: {
18482     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18483     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18484     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18485                                 DAG.getConstant(X86::COND_NE, dl, MVT::i8),
18486                                 InTrans);
18487     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
18488     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
18489                        Ret, SDValue(InTrans.getNode(), 1));
18490   }
18491   // ADC/ADCX/SBB
18492   case ADX: {
18493     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18494     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
18495     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
18496                                 DAG.getConstant(-1, dl, MVT::i8));
18497     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
18498                               Op.getOperand(4), GenCF.getValue(1));
18499     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
18500                                  Op.getOperand(5), MachinePointerInfo());
18501     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18502                                 DAG.getConstant(X86::COND_B, dl, MVT::i8),
18503                                 Res.getValue(1));
18504     SDValue Results[] = { SetCC, Store };
18505     return DAG.getMergeValues(Results, dl);
18506   }
18507   case COMPRESS_TO_MEM: {
18508     SDValue Mask = Op.getOperand(4);
18509     SDValue DataToCompress = Op.getOperand(3);
18510     SDValue Addr = Op.getOperand(2);
18511     SDValue Chain = Op.getOperand(0);
18512     MVT VT = DataToCompress.getSimpleValueType();
18513
18514     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18515     assert(MemIntr && "Expected MemIntrinsicSDNode!");
18516
18517     if (isAllOnesConstant(Mask)) // return just a store
18518       return DAG.getStore(Chain, dl, DataToCompress, Addr,
18519                           MemIntr->getMemOperand());
18520
18521     SDValue Compressed =
18522       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
18523                            Mask, DAG.getUNDEF(VT), Subtarget, DAG);
18524     return DAG.getStore(Chain, dl, Compressed, Addr,
18525                         MemIntr->getMemOperand());
18526   }
18527   case TRUNCATE_TO_MEM_VI8:
18528   case TRUNCATE_TO_MEM_VI16:
18529   case TRUNCATE_TO_MEM_VI32: {
18530     SDValue Mask = Op.getOperand(4);
18531     SDValue DataToTruncate = Op.getOperand(3);
18532     SDValue Addr = Op.getOperand(2);
18533     SDValue Chain = Op.getOperand(0);
18534
18535     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18536     assert(MemIntr && "Expected MemIntrinsicSDNode!");
18537
18538     EVT VT  = MemIntr->getMemoryVT();
18539
18540     if (isAllOnesConstant(Mask)) // return just a truncate store
18541       return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
18542                                MemIntr->getMemOperand());
18543
18544     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18545     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18546
18547     return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
18548                               MemIntr->getMemOperand(), true);
18549   }
18550   case EXPAND_FROM_MEM: {
18551     SDValue Mask = Op.getOperand(4);
18552     SDValue PassThru = Op.getOperand(3);
18553     SDValue Addr = Op.getOperand(2);
18554     SDValue Chain = Op.getOperand(0);
18555     MVT VT = Op.getSimpleValueType();
18556
18557     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18558     assert(MemIntr && "Expected MemIntrinsicSDNode!");
18559
18560     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
18561                                        MemIntr->getMemOperand());
18562
18563     if (isAllOnesConstant(Mask)) // return just a load
18564       return DataToExpand;
18565
18566     SDValue Results[] = {
18567       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
18568                            Mask, PassThru, Subtarget, DAG), Chain};
18569     return DAG.getMergeValues(Results, dl);
18570   }
18571   }
18572 }
18573
18574 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
18575                                            SelectionDAG &DAG) const {
18576   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18577   MFI->setReturnAddressIsTaken(true);
18578
18579   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
18580     return SDValue();
18581
18582   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18583   SDLoc dl(Op);
18584   EVT PtrVT = getPointerTy(DAG.getDataLayout());
18585
18586   if (Depth > 0) {
18587     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
18588     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18589     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
18590     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18591                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
18592                        MachinePointerInfo());
18593   }
18594
18595   // Just load the return address.
18596   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
18597   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
18598                      MachinePointerInfo());
18599 }
18600
18601 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
18602   MachineFunction &MF = DAG.getMachineFunction();
18603   MachineFrameInfo *MFI = MF.getFrameInfo();
18604   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18605   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18606   EVT VT = Op.getValueType();
18607
18608   MFI->setFrameAddressIsTaken(true);
18609
18610   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
18611     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
18612     // is not possible to crawl up the stack without looking at the unwind codes
18613     // simultaneously.
18614     int FrameAddrIndex = FuncInfo->getFAIndex();
18615     if (!FrameAddrIndex) {
18616       // Set up a frame object for the return address.
18617       unsigned SlotSize = RegInfo->getSlotSize();
18618       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
18619           SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
18620       FuncInfo->setFAIndex(FrameAddrIndex);
18621     }
18622     return DAG.getFrameIndex(FrameAddrIndex, VT);
18623   }
18624
18625   unsigned FrameReg =
18626       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18627   SDLoc dl(Op);  // FIXME probably not meaningful
18628   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18629   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
18630           (FrameReg == X86::EBP && VT == MVT::i32)) &&
18631          "Invalid Frame Register!");
18632   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
18633   while (Depth--)
18634     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
18635                             MachinePointerInfo());
18636   return FrameAddr;
18637 }
18638
18639 // FIXME? Maybe this could be a TableGen attribute on some registers and
18640 // this table could be generated automatically from RegInfo.
18641 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
18642                                               SelectionDAG &DAG) const {
18643   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18644   const MachineFunction &MF = DAG.getMachineFunction();
18645
18646   unsigned Reg = StringSwitch<unsigned>(RegName)
18647                        .Case("esp", X86::ESP)
18648                        .Case("rsp", X86::RSP)
18649                        .Case("ebp", X86::EBP)
18650                        .Case("rbp", X86::RBP)
18651                        .Default(0);
18652
18653   if (Reg == X86::EBP || Reg == X86::RBP) {
18654     if (!TFI.hasFP(MF))
18655       report_fatal_error("register " + StringRef(RegName) +
18656                          " is allocatable: function has no frame pointer");
18657 #ifndef NDEBUG
18658     else {
18659       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18660       unsigned FrameReg =
18661           RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18662       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
18663              "Invalid Frame Register!");
18664     }
18665 #endif
18666   }
18667
18668   if (Reg)
18669     return Reg;
18670
18671   report_fatal_error("Invalid register name global variable");
18672 }
18673
18674 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18675                                                      SelectionDAG &DAG) const {
18676   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18677   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
18678 }
18679
18680 unsigned X86TargetLowering::getExceptionPointerRegister(
18681     const Constant *PersonalityFn) const {
18682   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
18683     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18684
18685   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
18686 }
18687
18688 unsigned X86TargetLowering::getExceptionSelectorRegister(
18689     const Constant *PersonalityFn) const {
18690   // Funclet personalities don't use selectors (the runtime does the selection).
18691   assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
18692   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18693 }
18694
18695 bool X86TargetLowering::needsFixedCatchObjects() const {
18696   return Subtarget.isTargetWin64();
18697 }
18698
18699 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18700   SDValue Chain     = Op.getOperand(0);
18701   SDValue Offset    = Op.getOperand(1);
18702   SDValue Handler   = Op.getOperand(2);
18703   SDLoc dl      (Op);
18704
18705   EVT PtrVT = getPointerTy(DAG.getDataLayout());
18706   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18707   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18708   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18709           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18710          "Invalid Frame Register!");
18711   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18712   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18713
18714   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18715                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
18716                                                        dl));
18717   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18718   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
18719   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18720
18721   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18722                      DAG.getRegister(StoreAddrReg, PtrVT));
18723 }
18724
18725 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18726                                                SelectionDAG &DAG) const {
18727   SDLoc DL(Op);
18728   // If the subtarget is not 64bit, we may need the global base reg
18729   // after isel expand pseudo, i.e., after CGBR pass ran.
18730   // Therefore, ask for the GlobalBaseReg now, so that the pass
18731   // inserts the code for us in case we need it.
18732   // Otherwise, we will end up in a situation where we will
18733   // reference a virtual register that is not defined!
18734   if (!Subtarget.is64Bit()) {
18735     const X86InstrInfo *TII = Subtarget.getInstrInfo();
18736     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
18737   }
18738   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18739                      DAG.getVTList(MVT::i32, MVT::Other),
18740                      Op.getOperand(0), Op.getOperand(1));
18741 }
18742
18743 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18744                                                 SelectionDAG &DAG) const {
18745   SDLoc DL(Op);
18746   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18747                      Op.getOperand(0), Op.getOperand(1));
18748 }
18749
18750 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
18751                                                        SelectionDAG &DAG) const {
18752   SDLoc DL(Op);
18753   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
18754                      Op.getOperand(0));
18755 }
18756
18757 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18758   return Op.getOperand(0);
18759 }
18760
18761 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18762                                                 SelectionDAG &DAG) const {
18763   SDValue Root = Op.getOperand(0);
18764   SDValue Trmp = Op.getOperand(1); // trampoline
18765   SDValue FPtr = Op.getOperand(2); // nested function
18766   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18767   SDLoc dl (Op);
18768
18769   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18770   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
18771
18772   if (Subtarget.is64Bit()) {
18773     SDValue OutChains[6];
18774
18775     // Large code-model.
18776     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18777     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18778
18779     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18780     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18781
18782     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18783
18784     // Load the pointer to the nested function into R11.
18785     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18786     SDValue Addr = Trmp;
18787     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18788                                 Addr, MachinePointerInfo(TrmpAddr));
18789
18790     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18791                        DAG.getConstant(2, dl, MVT::i64));
18792     OutChains[1] =
18793         DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
18794                      /* Alignment = */ 2);
18795
18796     // Load the 'nest' parameter value into R10.
18797     // R10 is specified in X86CallingConv.td
18798     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18799     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18800                        DAG.getConstant(10, dl, MVT::i64));
18801     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18802                                 Addr, MachinePointerInfo(TrmpAddr, 10));
18803
18804     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18805                        DAG.getConstant(12, dl, MVT::i64));
18806     OutChains[3] =
18807         DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
18808                      /* Alignment = */ 2);
18809
18810     // Jump to the nested function.
18811     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18812     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18813                        DAG.getConstant(20, dl, MVT::i64));
18814     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18815                                 Addr, MachinePointerInfo(TrmpAddr, 20));
18816
18817     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18818     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18819                        DAG.getConstant(22, dl, MVT::i64));
18820     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
18821                                 Addr, MachinePointerInfo(TrmpAddr, 22));
18822
18823     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18824   } else {
18825     const Function *Func =
18826       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18827     CallingConv::ID CC = Func->getCallingConv();
18828     unsigned NestReg;
18829
18830     switch (CC) {
18831     default:
18832       llvm_unreachable("Unsupported calling convention");
18833     case CallingConv::C:
18834     case CallingConv::X86_StdCall: {
18835       // Pass 'nest' parameter in ECX.
18836       // Must be kept in sync with X86CallingConv.td
18837       NestReg = X86::ECX;
18838
18839       // Check that ECX wasn't needed by an 'inreg' parameter.
18840       FunctionType *FTy = Func->getFunctionType();
18841       const AttributeSet &Attrs = Func->getAttributes();
18842
18843       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18844         unsigned InRegCount = 0;
18845         unsigned Idx = 1;
18846
18847         for (FunctionType::param_iterator I = FTy->param_begin(),
18848              E = FTy->param_end(); I != E; ++I, ++Idx)
18849           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
18850             auto &DL = DAG.getDataLayout();
18851             // FIXME: should only count parameters that are lowered to integers.
18852             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
18853           }
18854
18855         if (InRegCount > 2) {
18856           report_fatal_error("Nest register in use - reduce number of inreg"
18857                              " parameters!");
18858         }
18859       }
18860       break;
18861     }
18862     case CallingConv::X86_FastCall:
18863     case CallingConv::X86_ThisCall:
18864     case CallingConv::Fast:
18865       // Pass 'nest' parameter in EAX.
18866       // Must be kept in sync with X86CallingConv.td
18867       NestReg = X86::EAX;
18868       break;
18869     }
18870
18871     SDValue OutChains[4];
18872     SDValue Addr, Disp;
18873
18874     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18875                        DAG.getConstant(10, dl, MVT::i32));
18876     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18877
18878     // This is storing the opcode for MOV32ri.
18879     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18880     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18881     OutChains[0] =
18882         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
18883                      Trmp, MachinePointerInfo(TrmpAddr));
18884
18885     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18886                        DAG.getConstant(1, dl, MVT::i32));
18887     OutChains[1] =
18888         DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
18889                      /* Alignment = */ 1);
18890
18891     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18892     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18893                        DAG.getConstant(5, dl, MVT::i32));
18894     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
18895                                 Addr, MachinePointerInfo(TrmpAddr, 5),
18896                                 /* Alignment = */ 1);
18897
18898     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18899                        DAG.getConstant(6, dl, MVT::i32));
18900     OutChains[3] =
18901         DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
18902                      /* Alignment = */ 1);
18903
18904     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18905   }
18906 }
18907
18908 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18909                                             SelectionDAG &DAG) const {
18910   /*
18911    The rounding mode is in bits 11:10 of FPSR, and has the following
18912    settings:
18913      00 Round to nearest
18914      01 Round to -inf
18915      10 Round to +inf
18916      11 Round to 0
18917
18918   FLT_ROUNDS, on the other hand, expects the following:
18919     -1 Undefined
18920      0 Round to 0
18921      1 Round to nearest
18922      2 Round to +inf
18923      3 Round to -inf
18924
18925   To perform the conversion, we do:
18926     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18927   */
18928
18929   MachineFunction &MF = DAG.getMachineFunction();
18930   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18931   unsigned StackAlignment = TFI.getStackAlignment();
18932   MVT VT = Op.getSimpleValueType();
18933   SDLoc DL(Op);
18934
18935   // Save FP Control Word to stack slot
18936   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18937   SDValue StackSlot =
18938       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
18939
18940   MachineMemOperand *MMO =
18941       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
18942                               MachineMemOperand::MOStore, 2, 2);
18943
18944   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18945   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18946                                           DAG.getVTList(MVT::Other),
18947                                           Ops, MVT::i16, MMO);
18948
18949   // Load FP Control Word from stack slot
18950   SDValue CWD =
18951       DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
18952
18953   // Transform as necessary
18954   SDValue CWD1 =
18955     DAG.getNode(ISD::SRL, DL, MVT::i16,
18956                 DAG.getNode(ISD::AND, DL, MVT::i16,
18957                             CWD, DAG.getConstant(0x800, DL, MVT::i16)),
18958                 DAG.getConstant(11, DL, MVT::i8));
18959   SDValue CWD2 =
18960     DAG.getNode(ISD::SRL, DL, MVT::i16,
18961                 DAG.getNode(ISD::AND, DL, MVT::i16,
18962                             CWD, DAG.getConstant(0x400, DL, MVT::i16)),
18963                 DAG.getConstant(9, DL, MVT::i8));
18964
18965   SDValue RetVal =
18966     DAG.getNode(ISD::AND, DL, MVT::i16,
18967                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18968                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18969                             DAG.getConstant(1, DL, MVT::i16)),
18970                 DAG.getConstant(3, DL, MVT::i16));
18971
18972   return DAG.getNode((VT.getSizeInBits() < 16 ?
18973                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18974 }
18975
18976 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
18977 //
18978 // 1. i32/i64 128/256-bit vector (native support require VLX) are expended
18979 //    to 512-bit vector.
18980 // 2. i8/i16 vector implemented using dword LZCNT vector instruction
18981 //    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
18982 //    split the vector, perform operation on it's Lo a Hi part and
18983 //    concatenate the results.
18984 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
18985   assert(Op.getOpcode() == ISD::CTLZ);
18986   SDLoc dl(Op);
18987   MVT VT = Op.getSimpleValueType();
18988   MVT EltVT = VT.getVectorElementType();
18989   unsigned NumElems = VT.getVectorNumElements();
18990
18991   if (EltVT == MVT::i64 || EltVT == MVT::i32) {
18992     // Extend to 512 bit vector.
18993     assert((VT.is256BitVector() || VT.is128BitVector()) &&
18994               "Unsupported value type for operation");
18995
18996     MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
18997     SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
18998                                  DAG.getUNDEF(NewVT),
18999                                  Op.getOperand(0),
19000                                  DAG.getIntPtrConstant(0, dl));
19001     SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
19002
19003     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
19004                        DAG.getIntPtrConstant(0, dl));
19005   }
19006
19007   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
19008           "Unsupported element type");
19009
19010   if (16 < NumElems) {
19011     // Split vector, it's Lo and Hi parts will be handled in next iteration.
19012     SDValue Lo, Hi;
19013     std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
19014     MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
19015
19016     Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
19017     Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
19018
19019     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
19020   }
19021
19022   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
19023
19024   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
19025           "Unsupported value type for operation");
19026
19027   // Use native supported vector instruction vplzcntd.
19028   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
19029   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
19030   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
19031   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
19032
19033   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
19034 }
19035
19036 // Lower CTLZ using a PSHUFB lookup table implementation.
19037 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
19038                                        const X86Subtarget &Subtarget,
19039                                        SelectionDAG &DAG) {
19040   MVT VT = Op.getSimpleValueType();
19041   int NumElts = VT.getVectorNumElements();
19042   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
19043   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
19044
19045   // Per-nibble leading zero PSHUFB lookup table.
19046   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
19047                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
19048                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
19049                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
19050
19051   SmallVector<SDValue, 64> LUTVec;
19052   for (int i = 0; i < NumBytes; ++i)
19053     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
19054   SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
19055
19056   // Begin by bitcasting the input to byte vector, then split those bytes
19057   // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
19058   // If the hi input nibble is zero then we add both results together, otherwise
19059   // we just take the hi result (by masking the lo result to zero before the
19060   // add).
19061   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
19062   SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
19063
19064   SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
19065   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
19066   SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
19067   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
19068   SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
19069
19070   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
19071   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
19072   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
19073   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
19074
19075   // Merge result back from vXi8 back to VT, working on the lo/hi halves
19076   // of the current vector width in the same way we did for the nibbles.
19077   // If the upper half of the input element is zero then add the halves'
19078   // leading zero counts together, otherwise just use the upper half's.
19079   // Double the width of the result until we are at target width.
19080   while (CurrVT != VT) {
19081     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
19082     int CurrNumElts = CurrVT.getVectorNumElements();
19083     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
19084     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
19085     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
19086
19087     // Check if the upper half of the input element is zero.
19088     SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
19089                                DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
19090     HiZ = DAG.getBitcast(NextVT, HiZ);
19091
19092     // Move the upper/lower halves to the lower bits as we'll be extending to
19093     // NextVT. Mask the lower result to zero if HiZ is true and add the results
19094     // together.
19095     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
19096     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
19097     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
19098     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
19099     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
19100     CurrVT = NextVT;
19101   }
19102
19103   return Res;
19104 }
19105
19106 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
19107                                const X86Subtarget &Subtarget,
19108                                SelectionDAG &DAG) {
19109   MVT VT = Op.getSimpleValueType();
19110   SDValue Op0 = Op.getOperand(0);
19111
19112   if (Subtarget.hasAVX512())
19113     return LowerVectorCTLZ_AVX512(Op, DAG);
19114
19115   // Decompose 256-bit ops into smaller 128-bit ops.
19116   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19117     unsigned NumElems = VT.getVectorNumElements();
19118
19119     // Extract each 128-bit vector, perform ctlz and concat the result.
19120     SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
19121     SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
19122
19123     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
19124                        DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
19125                        DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
19126   }
19127
19128   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
19129   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
19130 }
19131
19132 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
19133                          SelectionDAG &DAG) {
19134   MVT VT = Op.getSimpleValueType();
19135   MVT OpVT = VT;
19136   unsigned NumBits = VT.getSizeInBits();
19137   SDLoc dl(Op);
19138   unsigned Opc = Op.getOpcode();
19139
19140   if (VT.isVector())
19141     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
19142
19143   Op = Op.getOperand(0);
19144   if (VT == MVT::i8) {
19145     // Zero extend to i32 since there is not an i8 bsr.
19146     OpVT = MVT::i32;
19147     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
19148   }
19149
19150   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
19151   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
19152   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
19153
19154   if (Opc == ISD::CTLZ) {
19155     // If src is zero (i.e. bsr sets ZF), returns NumBits.
19156     SDValue Ops[] = {
19157       Op,
19158       DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
19159       DAG.getConstant(X86::COND_E, dl, MVT::i8),
19160       Op.getValue(1)
19161     };
19162     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
19163   }
19164
19165   // Finally xor with NumBits-1.
19166   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
19167                    DAG.getConstant(NumBits - 1, dl, OpVT));
19168
19169   if (VT == MVT::i8)
19170     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
19171   return Op;
19172 }
19173
19174 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
19175   MVT VT = Op.getSimpleValueType();
19176   unsigned NumBits = VT.getScalarSizeInBits();
19177   SDLoc dl(Op);
19178
19179   if (VT.isVector()) {
19180     SDValue N0 = Op.getOperand(0);
19181     SDValue Zero = DAG.getConstant(0, dl, VT);
19182
19183     // lsb(x) = (x & -x)
19184     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
19185                               DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
19186
19187     // cttz_undef(x) = (width - 1) - ctlz(lsb)
19188     if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
19189       SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
19190       return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
19191                          DAG.getNode(ISD::CTLZ, dl, VT, LSB));
19192     }
19193
19194     // cttz(x) = ctpop(lsb - 1)
19195     SDValue One = DAG.getConstant(1, dl, VT);
19196     return DAG.getNode(ISD::CTPOP, dl, VT,
19197                        DAG.getNode(ISD::SUB, dl, VT, LSB, One));
19198   }
19199
19200   assert(Op.getOpcode() == ISD::CTTZ &&
19201          "Only scalar CTTZ requires custom lowering");
19202
19203   // Issue a bsf (scan bits forward) which also sets EFLAGS.
19204   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19205   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
19206
19207   // If src is zero (i.e. bsf sets ZF), returns NumBits.
19208   SDValue Ops[] = {
19209     Op,
19210     DAG.getConstant(NumBits, dl, VT),
19211     DAG.getConstant(X86::COND_E, dl, MVT::i8),
19212     Op.getValue(1)
19213   };
19214   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
19215 }
19216
19217 /// Break a 256-bit integer operation into two new 128-bit ones and then
19218 /// concatenate the result back.
19219 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
19220   MVT VT = Op.getSimpleValueType();
19221
19222   assert(VT.is256BitVector() && VT.isInteger() &&
19223          "Unsupported value type for operation");
19224
19225   unsigned NumElems = VT.getVectorNumElements();
19226   SDLoc dl(Op);
19227
19228   // Extract the LHS vectors
19229   SDValue LHS = Op.getOperand(0);
19230   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
19231   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
19232
19233   // Extract the RHS vectors
19234   SDValue RHS = Op.getOperand(1);
19235   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
19236   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
19237
19238   MVT EltVT = VT.getVectorElementType();
19239   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19240
19241   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19242                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19243                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19244 }
19245
19246 /// Break a 512-bit integer operation into two new 256-bit ones and then
19247 /// concatenate the result back.
19248 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
19249   MVT VT = Op.getSimpleValueType();
19250
19251   assert(VT.is512BitVector() && VT.isInteger() &&
19252          "Unsupported value type for operation");
19253
19254   unsigned NumElems = VT.getVectorNumElements();
19255   SDLoc dl(Op);
19256
19257   // Extract the LHS vectors
19258   SDValue LHS = Op.getOperand(0);
19259   SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
19260   SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
19261
19262   // Extract the RHS vectors
19263   SDValue RHS = Op.getOperand(1);
19264   SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
19265   SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
19266
19267   MVT EltVT = VT.getVectorElementType();
19268   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19269
19270   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19271                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19272                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19273 }
19274
19275 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
19276   if (Op.getValueType() == MVT::i1)
19277     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19278                        Op.getOperand(0), Op.getOperand(1));
19279   assert(Op.getSimpleValueType().is256BitVector() &&
19280          Op.getSimpleValueType().isInteger() &&
19281          "Only handle AVX 256-bit vector integer operation");
19282   return Lower256IntArith(Op, DAG);
19283 }
19284
19285 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
19286   if (Op.getValueType() == MVT::i1)
19287     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19288                        Op.getOperand(0), Op.getOperand(1));
19289   assert(Op.getSimpleValueType().is256BitVector() &&
19290          Op.getSimpleValueType().isInteger() &&
19291          "Only handle AVX 256-bit vector integer operation");
19292   return Lower256IntArith(Op, DAG);
19293 }
19294
19295 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
19296   assert(Op.getSimpleValueType().is256BitVector() &&
19297          Op.getSimpleValueType().isInteger() &&
19298          "Only handle AVX 256-bit vector integer operation");
19299   return Lower256IntArith(Op, DAG);
19300 }
19301
19302 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
19303                         SelectionDAG &DAG) {
19304   SDLoc dl(Op);
19305   MVT VT = Op.getSimpleValueType();
19306
19307   if (VT == MVT::i1)
19308     return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
19309
19310   // Decompose 256-bit ops into smaller 128-bit ops.
19311   if (VT.is256BitVector() && !Subtarget.hasInt256())
19312     return Lower256IntArith(Op, DAG);
19313
19314   SDValue A = Op.getOperand(0);
19315   SDValue B = Op.getOperand(1);
19316
19317   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
19318   // vector pairs, multiply and truncate.
19319   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
19320     if (Subtarget.hasInt256()) {
19321       // For 512-bit vectors, split into 256-bit vectors to allow the
19322       // sign-extension to occur.
19323       if (VT == MVT::v64i8)
19324         return Lower512IntArith(Op, DAG);
19325
19326       // For 256-bit vectors, split into 128-bit vectors to allow the
19327       // sign-extension to occur. We don't need this on AVX512BW as we can
19328       // safely sign-extend to v32i16.
19329       if (VT == MVT::v32i8 && !Subtarget.hasBWI())
19330         return Lower256IntArith(Op, DAG);
19331
19332       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
19333       return DAG.getNode(
19334           ISD::TRUNCATE, dl, VT,
19335           DAG.getNode(ISD::MUL, dl, ExVT,
19336                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
19337                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
19338     }
19339
19340     assert(VT == MVT::v16i8 &&
19341            "Pre-AVX2 support only supports v16i8 multiplication");
19342     MVT ExVT = MVT::v8i16;
19343
19344     // Extract the lo parts and sign extend to i16
19345     SDValue ALo, BLo;
19346     if (Subtarget.hasSSE41()) {
19347       ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
19348       BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
19349     } else {
19350       const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19351                               -1, 4, -1, 5, -1, 6, -1, 7};
19352       ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19353       BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19354       ALo = DAG.getBitcast(ExVT, ALo);
19355       BLo = DAG.getBitcast(ExVT, BLo);
19356       ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19357       BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19358     }
19359
19360     // Extract the hi parts and sign extend to i16
19361     SDValue AHi, BHi;
19362     if (Subtarget.hasSSE41()) {
19363       const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19364                               -1, -1, -1, -1, -1, -1, -1, -1};
19365       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19366       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19367       AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
19368       BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
19369     } else {
19370       const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
19371                               -1, 12, -1, 13, -1, 14, -1, 15};
19372       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19373       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19374       AHi = DAG.getBitcast(ExVT, AHi);
19375       BHi = DAG.getBitcast(ExVT, BHi);
19376       AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19377       BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
19378     }
19379
19380     // Multiply, mask the lower 8bits of the lo/hi results and pack
19381     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19382     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19383     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
19384     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
19385     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19386   }
19387
19388   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
19389   if (VT == MVT::v4i32) {
19390     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
19391            "Should not custom lower when pmuldq is available!");
19392
19393     // Extract the odd parts.
19394     static const int UnpackMask[] = { 1, -1, 3, -1 };
19395     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
19396     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
19397
19398     // Multiply the even parts.
19399     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
19400     // Now multiply odd parts.
19401     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
19402
19403     Evens = DAG.getBitcast(VT, Evens);
19404     Odds = DAG.getBitcast(VT, Odds);
19405
19406     // Merge the two vectors back together with a shuffle. This expands into 2
19407     // shuffles.
19408     static const int ShufMask[] = { 0, 4, 2, 6 };
19409     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
19410   }
19411
19412   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
19413          "Only know how to lower V2I64/V4I64/V8I64 multiply");
19414
19415   //  Ahi = psrlqi(a, 32);
19416   //  Bhi = psrlqi(b, 32);
19417   //
19418   //  AloBlo = pmuludq(a, b);
19419   //  AloBhi = pmuludq(a, Bhi);
19420   //  AhiBlo = pmuludq(Ahi, b);
19421
19422   //  AloBhi = psllqi(AloBhi, 32);
19423   //  AhiBlo = psllqi(AhiBlo, 32);
19424   //  return AloBlo + AloBhi + AhiBlo;
19425
19426   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
19427   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
19428
19429   SDValue AhiBlo = Ahi;
19430   SDValue AloBhi = Bhi;
19431   // Bit cast to 32-bit vectors for MULUDQ
19432   MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
19433                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
19434   A = DAG.getBitcast(MulVT, A);
19435   B = DAG.getBitcast(MulVT, B);
19436   Ahi = DAG.getBitcast(MulVT, Ahi);
19437   Bhi = DAG.getBitcast(MulVT, Bhi);
19438
19439   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
19440   // After shifting right const values the result may be all-zero.
19441   if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
19442     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
19443     AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
19444   }
19445   if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
19446     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
19447     AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
19448   }
19449
19450   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
19451   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
19452 }
19453
19454 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
19455                          SelectionDAG &DAG) {
19456   SDLoc dl(Op);
19457   MVT VT = Op.getSimpleValueType();
19458
19459   // Decompose 256-bit ops into smaller 128-bit ops.
19460   if (VT.is256BitVector() && !Subtarget.hasInt256())
19461     return Lower256IntArith(Op, DAG);
19462
19463   // Only i8 vectors should need custom lowering after this.
19464   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
19465          "Unsupported vector type");
19466
19467   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
19468   // logical shift down the upper half and pack back to i8.
19469   SDValue A = Op.getOperand(0);
19470   SDValue B = Op.getOperand(1);
19471
19472   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
19473   // and then ashr/lshr the upper bits down to the lower bits before multiply.
19474   unsigned Opcode = Op.getOpcode();
19475   unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
19476   unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
19477
19478   // AVX2 implementations - extend xmm subvectors to ymm.
19479   if (Subtarget.hasInt256()) {
19480     SDValue Lo = DAG.getIntPtrConstant(0, dl);
19481     SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
19482
19483     if (VT == MVT::v32i8) {
19484       SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
19485       SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
19486       SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
19487       SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
19488       ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
19489       BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
19490       AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
19491       BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
19492       Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19493                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
19494                        DAG.getConstant(8, dl, MVT::v16i16));
19495       Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19496                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
19497                        DAG.getConstant(8, dl, MVT::v16i16));
19498       // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
19499       // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
19500       const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
19501                             16, 17, 18, 19, 20, 21, 22, 23};
19502       const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19503                             24, 25, 26, 27, 28, 29, 30, 31};
19504       return DAG.getNode(X86ISD::PACKUS, dl, VT,
19505                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
19506                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
19507     }
19508
19509     SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
19510     SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
19511     SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
19512     SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
19513                                DAG.getConstant(8, dl, MVT::v16i16));
19514     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
19515     Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
19516     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
19517   }
19518
19519   assert(VT == MVT::v16i8 &&
19520          "Pre-AVX2 support only supports v16i8 multiplication");
19521   MVT ExVT = MVT::v8i16;
19522
19523   // Extract the lo parts and zero/sign extend to i16.
19524   SDValue ALo, BLo;
19525   if (Subtarget.hasSSE41()) {
19526     ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
19527     BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
19528   } else {
19529     const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19530                             -1, 4, -1, 5, -1, 6, -1, 7};
19531     ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19532     BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19533     ALo = DAG.getBitcast(ExVT, ALo);
19534     BLo = DAG.getBitcast(ExVT, BLo);
19535     ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19536     BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19537   }
19538
19539   // Extract the hi parts and zero/sign extend to i16.
19540   SDValue AHi, BHi;
19541   if (Subtarget.hasSSE41()) {
19542     const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19543                             -1, -1, -1, -1, -1, -1, -1, -1};
19544     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19545     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19546     AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
19547     BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
19548   } else {
19549     const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
19550                             -1, 12, -1, 13, -1, 14, -1, 15};
19551     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19552     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19553     AHi = DAG.getBitcast(ExVT, AHi);
19554     BHi = DAG.getBitcast(ExVT, BHi);
19555     AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19556     BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
19557   }
19558
19559   // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
19560   // pack back to v16i8.
19561   SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19562   SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19563   RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
19564   RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
19565   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19566 }
19567
19568 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
19569   assert(Subtarget.isTargetWin64() && "Unexpected target");
19570   EVT VT = Op.getValueType();
19571   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
19572          "Unexpected return type for lowering");
19573
19574   RTLIB::Libcall LC;
19575   bool isSigned;
19576   switch (Op->getOpcode()) {
19577   default: llvm_unreachable("Unexpected request for libcall!");
19578   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
19579   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
19580   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
19581   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
19582   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
19583   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
19584   }
19585
19586   SDLoc dl(Op);
19587   SDValue InChain = DAG.getEntryNode();
19588
19589   TargetLowering::ArgListTy Args;
19590   TargetLowering::ArgListEntry Entry;
19591   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
19592     EVT ArgVT = Op->getOperand(i).getValueType();
19593     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
19594            "Unexpected argument type for lowering");
19595     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
19596     Entry.Node = StackPtr;
19597     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
19598                            MachinePointerInfo(), /* Alignment = */ 16);
19599     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19600     Entry.Ty = PointerType::get(ArgTy,0);
19601     Entry.isSExt = false;
19602     Entry.isZExt = false;
19603     Args.push_back(Entry);
19604   }
19605
19606   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
19607                                          getPointerTy(DAG.getDataLayout()));
19608
19609   TargetLowering::CallLoweringInfo CLI(DAG);
19610   CLI.setDebugLoc(dl).setChain(InChain)
19611     .setCallee(getLibcallCallingConv(LC),
19612                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
19613                Callee, std::move(Args))
19614     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
19615
19616   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
19617   return DAG.getBitcast(VT, CallInfo.first);
19618 }
19619
19620 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
19621                              SelectionDAG &DAG) {
19622   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
19623   MVT VT = Op0.getSimpleValueType();
19624   SDLoc dl(Op);
19625
19626   // Decompose 256-bit ops into smaller 128-bit ops.
19627   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19628     unsigned Opcode = Op.getOpcode();
19629     unsigned NumElems = VT.getVectorNumElements();
19630     MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
19631     SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
19632     SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
19633     SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
19634     SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
19635     SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
19636     SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
19637     SDValue Ops[] = {
19638       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
19639       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
19640     };
19641     return DAG.getMergeValues(Ops, dl);
19642   }
19643
19644   assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
19645          (VT == MVT::v8i32 && Subtarget.hasInt256()));
19646
19647   // PMULxD operations multiply each even value (starting at 0) of LHS with
19648   // the related value of RHS and produce a widen result.
19649   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19650   // => <2 x i64> <ae|cg>
19651   //
19652   // In other word, to have all the results, we need to perform two PMULxD:
19653   // 1. one with the even values.
19654   // 2. one with the odd values.
19655   // To achieve #2, with need to place the odd values at an even position.
19656   //
19657   // Place the odd value at an even position (basically, shift all values 1
19658   // step to the left):
19659   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
19660   // <a|b|c|d> => <b|undef|d|undef>
19661   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
19662                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19663   // <e|f|g|h> => <f|undef|h|undef>
19664   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
19665                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19666
19667   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
19668   // ints.
19669   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
19670   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
19671   unsigned Opcode =
19672       (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
19673   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19674   // => <2 x i64> <ae|cg>
19675   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
19676   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
19677   // => <2 x i64> <bf|dh>
19678   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
19679
19680   // Shuffle it back into the right order.
19681   SDValue Highs, Lows;
19682   if (VT == MVT::v8i32) {
19683     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
19684     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19685     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
19686     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19687   } else {
19688     const int HighMask[] = {1, 5, 3, 7};
19689     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19690     const int LowMask[] = {0, 4, 2, 6};
19691     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19692   }
19693
19694   // If we have a signed multiply but no PMULDQ fix up the high parts of a
19695   // unsigned multiply.
19696   if (IsSigned && !Subtarget.hasSSE41()) {
19697     SDValue ShAmt = DAG.getConstant(
19698         31, dl,
19699         DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
19700     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
19701                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
19702     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
19703                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
19704
19705     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
19706     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
19707   }
19708
19709   // The first result of MUL_LOHI is actually the low value, followed by the
19710   // high value.
19711   SDValue Ops[] = {Lows, Highs};
19712   return DAG.getMergeValues(Ops, dl);
19713 }
19714
19715 // Return true if the required (according to Opcode) shift-imm form is natively
19716 // supported by the Subtarget
19717 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
19718                                         unsigned Opcode) {
19719   if (VT.getScalarSizeInBits() < 16)
19720     return false;
19721
19722   if (VT.is512BitVector() &&
19723       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
19724     return true;
19725
19726   bool LShift = VT.is128BitVector() ||
19727     (VT.is256BitVector() && Subtarget.hasInt256());
19728
19729   bool AShift = LShift && (Subtarget.hasVLX() ||
19730     (VT != MVT::v2i64 && VT != MVT::v4i64));
19731   return (Opcode == ISD::SRA) ? AShift : LShift;
19732 }
19733
19734 // The shift amount is a variable, but it is the same for all vector lanes.
19735 // These instructions are defined together with shift-immediate.
19736 static
19737 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
19738                                       unsigned Opcode) {
19739   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
19740 }
19741
19742 // Return true if the required (according to Opcode) variable-shift form is
19743 // natively supported by the Subtarget
19744 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
19745                                     unsigned Opcode) {
19746
19747   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
19748     return false;
19749
19750   // vXi16 supported only on AVX-512, BWI
19751   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
19752     return false;
19753
19754   if (VT.is512BitVector() || Subtarget.hasVLX())
19755     return true;
19756
19757   bool LShift = VT.is128BitVector() || VT.is256BitVector();
19758   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
19759   return (Opcode == ISD::SRA) ? AShift : LShift;
19760 }
19761
19762 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
19763                                          const X86Subtarget &Subtarget) {
19764   MVT VT = Op.getSimpleValueType();
19765   SDLoc dl(Op);
19766   SDValue R = Op.getOperand(0);
19767   SDValue Amt = Op.getOperand(1);
19768
19769   unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19770     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19771
19772   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
19773     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
19774     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
19775     SDValue Ex = DAG.getBitcast(ExVT, R);
19776
19777     if (ShiftAmt >= 32) {
19778       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
19779       SDValue Upper =
19780           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
19781       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19782                                                  ShiftAmt - 32, DAG);
19783       if (VT == MVT::v2i64)
19784         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
19785       if (VT == MVT::v4i64)
19786         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19787                                   {9, 1, 11, 3, 13, 5, 15, 7});
19788     } else {
19789       // SRA upper i32, SHL whole i64 and select lower i32.
19790       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19791                                                  ShiftAmt, DAG);
19792       SDValue Lower =
19793           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
19794       Lower = DAG.getBitcast(ExVT, Lower);
19795       if (VT == MVT::v2i64)
19796         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
19797       if (VT == MVT::v4i64)
19798         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19799                                   {8, 1, 10, 3, 12, 5, 14, 7});
19800     }
19801     return DAG.getBitcast(VT, Ex);
19802   };
19803
19804   // Optimize shl/srl/sra with constant shift amount.
19805   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
19806     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
19807       uint64_t ShiftAmt = ShiftConst->getZExtValue();
19808
19809       if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19810         return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19811
19812       // i64 SRA needs to be performed as partial shifts.
19813       if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
19814           Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
19815         return ArithmeticShiftRight64(ShiftAmt);
19816
19817       if (VT == MVT::v16i8 ||
19818           (Subtarget.hasInt256() && VT == MVT::v32i8) ||
19819           VT == MVT::v64i8) {
19820         unsigned NumElts = VT.getVectorNumElements();
19821         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
19822
19823         // Simple i8 add case
19824         if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
19825           return DAG.getNode(ISD::ADD, dl, VT, R, R);
19826
19827         // ashr(R, 7)  === cmp_slt(R, 0)
19828         if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
19829           SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
19830           if (VT.is512BitVector()) {
19831             assert(VT == MVT::v64i8 && "Unexpected element type!");
19832             SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
19833             return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
19834           }
19835           return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
19836         }
19837
19838         // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
19839         if (VT == MVT::v16i8 && Subtarget.hasXOP())
19840           return SDValue();
19841
19842         if (Op.getOpcode() == ISD::SHL) {
19843           // Make a large shift.
19844           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
19845                                                    R, ShiftAmt, DAG);
19846           SHL = DAG.getBitcast(VT, SHL);
19847           // Zero out the rightmost bits.
19848           return DAG.getNode(ISD::AND, dl, VT, SHL,
19849                              DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
19850         }
19851         if (Op.getOpcode() == ISD::SRL) {
19852           // Make a large shift.
19853           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
19854                                                    R, ShiftAmt, DAG);
19855           SRL = DAG.getBitcast(VT, SRL);
19856           // Zero out the leftmost bits.
19857           return DAG.getNode(ISD::AND, dl, VT, SRL,
19858                              DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
19859         }
19860         if (Op.getOpcode() == ISD::SRA) {
19861           // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
19862           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
19863
19864           SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
19865           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
19866           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
19867           return Res;
19868         }
19869         llvm_unreachable("Unknown shift opcode.");
19870       }
19871     }
19872   }
19873
19874   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19875   if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
19876       (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
19877
19878     // Peek through any splat that was introduced for i64 shift vectorization.
19879     int SplatIndex = -1;
19880     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
19881       if (SVN->isSplat()) {
19882         SplatIndex = SVN->getSplatIndex();
19883         Amt = Amt.getOperand(0);
19884         assert(SplatIndex < (int)VT.getVectorNumElements() &&
19885                "Splat shuffle referencing second operand");
19886       }
19887
19888     if (Amt.getOpcode() != ISD::BITCAST ||
19889         Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
19890       return SDValue();
19891
19892     Amt = Amt.getOperand(0);
19893     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19894                      VT.getVectorNumElements();
19895     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
19896     uint64_t ShiftAmt = 0;
19897     unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
19898     for (unsigned i = 0; i != Ratio; ++i) {
19899       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
19900       if (!C)
19901         return SDValue();
19902       // 6 == Log2(64)
19903       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
19904     }
19905
19906     // Check remaining shift amounts (if not a splat).
19907     if (SplatIndex < 0) {
19908       for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19909         uint64_t ShAmt = 0;
19910         for (unsigned j = 0; j != Ratio; ++j) {
19911           ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
19912           if (!C)
19913             return SDValue();
19914           // 6 == Log2(64)
19915           ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
19916         }
19917         if (ShAmt != ShiftAmt)
19918           return SDValue();
19919       }
19920     }
19921
19922     if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19923       return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19924
19925     if (Op.getOpcode() == ISD::SRA)
19926       return ArithmeticShiftRight64(ShiftAmt);
19927   }
19928
19929   return SDValue();
19930 }
19931
19932 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
19933                                         const X86Subtarget &Subtarget) {
19934   MVT VT = Op.getSimpleValueType();
19935   SDLoc dl(Op);
19936   SDValue R = Op.getOperand(0);
19937   SDValue Amt = Op.getOperand(1);
19938
19939   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19940     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19941
19942   unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
19943     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
19944
19945   if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
19946     SDValue BaseShAmt;
19947     MVT EltVT = VT.getVectorElementType();
19948
19949     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
19950       // Check if this build_vector node is doing a splat.
19951       // If so, then set BaseShAmt equal to the splat value.
19952       BaseShAmt = BV->getSplatValue();
19953       if (BaseShAmt && BaseShAmt.isUndef())
19954         BaseShAmt = SDValue();
19955     } else {
19956       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
19957         Amt = Amt.getOperand(0);
19958
19959       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
19960       if (SVN && SVN->isSplat()) {
19961         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
19962         SDValue InVec = Amt.getOperand(0);
19963         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
19964           assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
19965                  "Unexpected shuffle index found!");
19966           BaseShAmt = InVec.getOperand(SplatIdx);
19967         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
19968            if (ConstantSDNode *C =
19969                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
19970              if (C->getZExtValue() == SplatIdx)
19971                BaseShAmt = InVec.getOperand(1);
19972            }
19973         }
19974
19975         if (!BaseShAmt)
19976           // Avoid introducing an extract element from a shuffle.
19977           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
19978                                   DAG.getIntPtrConstant(SplatIdx, dl));
19979       }
19980     }
19981
19982     if (BaseShAmt.getNode()) {
19983       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
19984       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
19985         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
19986       else if (EltVT.bitsLT(MVT::i32))
19987         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
19988
19989       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
19990     }
19991   }
19992
19993   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19994   if (!Subtarget.is64Bit() && VT == MVT::v2i64  &&
19995       Amt.getOpcode() == ISD::BITCAST &&
19996       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
19997     Amt = Amt.getOperand(0);
19998     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19999                      VT.getVectorNumElements();
20000     std::vector<SDValue> Vals(Ratio);
20001     for (unsigned i = 0; i != Ratio; ++i)
20002       Vals[i] = Amt.getOperand(i);
20003     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
20004       for (unsigned j = 0; j != Ratio; ++j)
20005         if (Vals[j] != Amt.getOperand(i + j))
20006           return SDValue();
20007     }
20008
20009     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
20010       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
20011   }
20012   return SDValue();
20013 }
20014
20015 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
20016                           SelectionDAG &DAG) {
20017   MVT VT = Op.getSimpleValueType();
20018   SDLoc dl(Op);
20019   SDValue R = Op.getOperand(0);
20020   SDValue Amt = Op.getOperand(1);
20021   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20022
20023   assert(VT.isVector() && "Custom lowering only for vector shifts!");
20024   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
20025
20026   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
20027     return V;
20028
20029   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
20030     return V;
20031
20032   if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
20033     return Op;
20034
20035   // XOP has 128-bit variable logical/arithmetic shifts.
20036   // +ve/-ve Amt = shift left/right.
20037   if (Subtarget.hasXOP() &&
20038       (VT == MVT::v2i64 || VT == MVT::v4i32 ||
20039        VT == MVT::v8i16 || VT == MVT::v16i8)) {
20040     if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
20041       SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20042       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
20043     }
20044     if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
20045       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
20046     if (Op.getOpcode() == ISD::SRA)
20047       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
20048   }
20049
20050   // 2i64 vector logical shifts can efficiently avoid scalarization - do the
20051   // shifts per-lane and then shuffle the partial results back together.
20052   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
20053     // Splat the shift amounts so the scalar shifts above will catch it.
20054     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
20055     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
20056     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
20057     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
20058     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
20059   }
20060
20061   // i64 vector arithmetic shift can be emulated with the transform:
20062   // M = lshr(SIGN_BIT, Amt)
20063   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
20064   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
20065       Op.getOpcode() == ISD::SRA) {
20066     SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
20067     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
20068     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
20069     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
20070     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
20071     return R;
20072   }
20073
20074   // If possible, lower this packed shift into a vector multiply instead of
20075   // expanding it into a sequence of scalar shifts.
20076   // Do this only if the vector shift count is a constant build_vector.
20077   if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
20078       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
20079        (Subtarget.hasInt256() && VT == MVT::v16i16))) {
20080     SmallVector<SDValue, 8> Elts;
20081     MVT SVT = VT.getVectorElementType();
20082     unsigned SVTBits = SVT.getSizeInBits();
20083     APInt One(SVTBits, 1);
20084     unsigned NumElems = VT.getVectorNumElements();
20085
20086     for (unsigned i=0; i !=NumElems; ++i) {
20087       SDValue Op = Amt->getOperand(i);
20088       if (Op->isUndef()) {
20089         Elts.push_back(Op);
20090         continue;
20091       }
20092
20093       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
20094       APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
20095       uint64_t ShAmt = C.getZExtValue();
20096       if (ShAmt >= SVTBits) {
20097         Elts.push_back(DAG.getUNDEF(SVT));
20098         continue;
20099       }
20100       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
20101     }
20102     SDValue BV = DAG.getBuildVector(VT, dl, Elts);
20103     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
20104   }
20105
20106   // Lower SHL with variable shift amount.
20107   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
20108     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
20109
20110     Op = DAG.getNode(ISD::ADD, dl, VT, Op,
20111                      DAG.getConstant(0x3f800000U, dl, VT));
20112     Op = DAG.getBitcast(MVT::v4f32, Op);
20113     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
20114     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
20115   }
20116
20117   // If possible, lower this shift as a sequence of two shifts by
20118   // constant plus a MOVSS/MOVSD instead of scalarizing it.
20119   // Example:
20120   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
20121   //
20122   // Could be rewritten as:
20123   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
20124   //
20125   // The advantage is that the two shifts from the example would be
20126   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
20127   // the vector shift into four scalar shifts plus four pairs of vector
20128   // insert/extract.
20129   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
20130     unsigned TargetOpcode = X86ISD::MOVSS;
20131     bool CanBeSimplified;
20132     // The splat value for the first packed shift (the 'X' from the example).
20133     SDValue Amt1 = Amt->getOperand(0);
20134     // The splat value for the second packed shift (the 'Y' from the example).
20135     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
20136
20137     // See if it is possible to replace this node with a sequence of
20138     // two shifts followed by a MOVSS/MOVSD
20139     if (VT == MVT::v4i32) {
20140       // Check if it is legal to use a MOVSS.
20141       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
20142                         Amt2 == Amt->getOperand(3);
20143       if (!CanBeSimplified) {
20144         // Otherwise, check if we can still simplify this node using a MOVSD.
20145         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
20146                           Amt->getOperand(2) == Amt->getOperand(3);
20147         TargetOpcode = X86ISD::MOVSD;
20148         Amt2 = Amt->getOperand(2);
20149       }
20150     } else {
20151       // Do similar checks for the case where the machine value type
20152       // is MVT::v8i16.
20153       CanBeSimplified = Amt1 == Amt->getOperand(1);
20154       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
20155         CanBeSimplified = Amt2 == Amt->getOperand(i);
20156
20157       if (!CanBeSimplified) {
20158         TargetOpcode = X86ISD::MOVSD;
20159         CanBeSimplified = true;
20160         Amt2 = Amt->getOperand(4);
20161         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
20162           CanBeSimplified = Amt1 == Amt->getOperand(i);
20163         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
20164           CanBeSimplified = Amt2 == Amt->getOperand(j);
20165       }
20166     }
20167
20168     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
20169         isa<ConstantSDNode>(Amt2)) {
20170       // Replace this node with two shifts followed by a MOVSS/MOVSD.
20171       MVT CastVT = MVT::v4i32;
20172       SDValue Splat1 =
20173         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
20174       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
20175       SDValue Splat2 =
20176         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
20177       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
20178       if (TargetOpcode == X86ISD::MOVSD)
20179         CastVT = MVT::v2i64;
20180       SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
20181       SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
20182       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
20183                                             BitCast1, DAG);
20184       return DAG.getBitcast(VT, Result);
20185     }
20186   }
20187
20188   // v4i32 Non Uniform Shifts.
20189   // If the shift amount is constant we can shift each lane using the SSE2
20190   // immediate shifts, else we need to zero-extend each lane to the lower i64
20191   // and shift using the SSE2 variable shifts.
20192   // The separate results can then be blended together.
20193   if (VT == MVT::v4i32) {
20194     unsigned Opc = Op.getOpcode();
20195     SDValue Amt0, Amt1, Amt2, Amt3;
20196     if (ConstantAmt) {
20197       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
20198       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
20199       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
20200       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
20201     } else {
20202       // ISD::SHL is handled above but we include it here for completeness.
20203       switch (Opc) {
20204       default:
20205         llvm_unreachable("Unknown target vector shift node");
20206       case ISD::SHL:
20207         Opc = X86ISD::VSHL;
20208         break;
20209       case ISD::SRL:
20210         Opc = X86ISD::VSRL;
20211         break;
20212       case ISD::SRA:
20213         Opc = X86ISD::VSRA;
20214         break;
20215       }
20216       // The SSE2 shifts use the lower i64 as the same shift amount for
20217       // all lanes and the upper i64 is ignored. These shuffle masks
20218       // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
20219       SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20220       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
20221       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
20222       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
20223       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
20224     }
20225
20226     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
20227     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
20228     SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
20229     SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
20230     SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
20231     SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
20232     return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
20233   }
20234
20235   if (VT == MVT::v16i8 ||
20236       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
20237     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
20238     unsigned ShiftOpcode = Op->getOpcode();
20239
20240     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
20241       // On SSE41 targets we make use of the fact that VSELECT lowers
20242       // to PBLENDVB which selects bytes based just on the sign bit.
20243       if (Subtarget.hasSSE41()) {
20244         V0 = DAG.getBitcast(VT, V0);
20245         V1 = DAG.getBitcast(VT, V1);
20246         Sel = DAG.getBitcast(VT, Sel);
20247         return DAG.getBitcast(SelVT,
20248                               DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
20249       }
20250       // On pre-SSE41 targets we test for the sign bit by comparing to
20251       // zero - a negative value will set all bits of the lanes to true
20252       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
20253       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
20254       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
20255       return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
20256     };
20257
20258     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
20259     // We can safely do this using i16 shifts as we're only interested in
20260     // the 3 lower bits of each byte.
20261     Amt = DAG.getBitcast(ExtVT, Amt);
20262     Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
20263     Amt = DAG.getBitcast(VT, Amt);
20264
20265     if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
20266       // r = VSELECT(r, shift(r, 4), a);
20267       SDValue M =
20268           DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20269       R = SignBitSelect(VT, Amt, M, R);
20270
20271       // a += a
20272       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20273
20274       // r = VSELECT(r, shift(r, 2), a);
20275       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20276       R = SignBitSelect(VT, Amt, M, R);
20277
20278       // a += a
20279       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20280
20281       // return VSELECT(r, shift(r, 1), a);
20282       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
20283       R = SignBitSelect(VT, Amt, M, R);
20284       return R;
20285     }
20286
20287     if (Op->getOpcode() == ISD::SRA) {
20288       // For SRA we need to unpack each byte to the higher byte of a i16 vector
20289       // so we can correctly sign extend. We don't care what happens to the
20290       // lower byte.
20291       SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
20292       SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
20293       SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
20294       SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
20295       ALo = DAG.getBitcast(ExtVT, ALo);
20296       AHi = DAG.getBitcast(ExtVT, AHi);
20297       RLo = DAG.getBitcast(ExtVT, RLo);
20298       RHi = DAG.getBitcast(ExtVT, RHi);
20299
20300       // r = VSELECT(r, shift(r, 4), a);
20301       SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20302                                 DAG.getConstant(4, dl, ExtVT));
20303       SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20304                                 DAG.getConstant(4, dl, ExtVT));
20305       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20306       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20307
20308       // a += a
20309       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20310       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20311
20312       // r = VSELECT(r, shift(r, 2), a);
20313       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20314                         DAG.getConstant(2, dl, ExtVT));
20315       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20316                         DAG.getConstant(2, dl, ExtVT));
20317       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20318       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20319
20320       // a += a
20321       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20322       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20323
20324       // r = VSELECT(r, shift(r, 1), a);
20325       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20326                         DAG.getConstant(1, dl, ExtVT));
20327       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20328                         DAG.getConstant(1, dl, ExtVT));
20329       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20330       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20331
20332       // Logical shift the result back to the lower byte, leaving a zero upper
20333       // byte
20334       // meaning that we can safely pack with PACKUSWB.
20335       RLo =
20336           DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
20337       RHi =
20338           DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
20339       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20340     }
20341   }
20342
20343   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
20344   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
20345   // solution better.
20346   if (Subtarget.hasInt256() && VT == MVT::v8i16) {
20347     MVT ExtVT = MVT::v8i32;
20348     unsigned ExtOpc =
20349         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20350     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
20351     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
20352     return DAG.getNode(ISD::TRUNCATE, dl, VT,
20353                        DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
20354   }
20355
20356   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
20357     MVT ExtVT = MVT::v8i32;
20358     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20359     SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
20360     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
20361     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
20362     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
20363     ALo = DAG.getBitcast(ExtVT, ALo);
20364     AHi = DAG.getBitcast(ExtVT, AHi);
20365     RLo = DAG.getBitcast(ExtVT, RLo);
20366     RHi = DAG.getBitcast(ExtVT, RHi);
20367     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
20368     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
20369     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
20370     Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
20371     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
20372   }
20373
20374   if (VT == MVT::v8i16) {
20375     unsigned ShiftOpcode = Op->getOpcode();
20376
20377     // If we have a constant shift amount, the non-SSE41 path is best as
20378     // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
20379     bool UseSSE41 = Subtarget.hasSSE41() &&
20380                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20381
20382     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
20383       // On SSE41 targets we make use of the fact that VSELECT lowers
20384       // to PBLENDVB which selects bytes based just on the sign bit.
20385       if (UseSSE41) {
20386         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
20387         V0 = DAG.getBitcast(ExtVT, V0);
20388         V1 = DAG.getBitcast(ExtVT, V1);
20389         Sel = DAG.getBitcast(ExtVT, Sel);
20390         return DAG.getBitcast(
20391             VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
20392       }
20393       // On pre-SSE41 targets we splat the sign bit - a negative value will
20394       // set all bits of the lanes to true and VSELECT uses that in
20395       // its OR(AND(V0,C),AND(V1,~C)) lowering.
20396       SDValue C =
20397           DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
20398       return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
20399     };
20400
20401     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
20402     if (UseSSE41) {
20403       // On SSE41 targets we need to replicate the shift mask in both
20404       // bytes for PBLENDVB.
20405       Amt = DAG.getNode(
20406           ISD::OR, dl, VT,
20407           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
20408           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
20409     } else {
20410       Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
20411     }
20412
20413     // r = VSELECT(r, shift(r, 8), a);
20414     SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
20415     R = SignBitSelect(Amt, M, R);
20416
20417     // a += a
20418     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20419
20420     // r = VSELECT(r, shift(r, 4), a);
20421     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20422     R = SignBitSelect(Amt, M, R);
20423
20424     // a += a
20425     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20426
20427     // r = VSELECT(r, shift(r, 2), a);
20428     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20429     R = SignBitSelect(Amt, M, R);
20430
20431     // a += a
20432     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20433
20434     // return VSELECT(r, shift(r, 1), a);
20435     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
20436     R = SignBitSelect(Amt, M, R);
20437     return R;
20438   }
20439
20440   // Decompose 256-bit shifts into smaller 128-bit shifts.
20441   if (VT.is256BitVector())
20442     return Lower256IntArith(Op, DAG);
20443
20444   return SDValue();
20445 }
20446
20447 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
20448                            SelectionDAG &DAG) {
20449   MVT VT = Op.getSimpleValueType();
20450   SDLoc DL(Op);
20451   SDValue R = Op.getOperand(0);
20452   SDValue Amt = Op.getOperand(1);
20453
20454   assert(VT.isVector() && "Custom lowering only for vector rotates!");
20455   assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
20456   assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
20457
20458   // XOP has 128-bit vector variable + immediate rotates.
20459   // +ve/-ve Amt = rotate left/right.
20460
20461   // Split 256-bit integers.
20462   if (VT.is256BitVector())
20463     return Lower256IntArith(Op, DAG);
20464
20465   assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
20466
20467   // Attempt to rotate by immediate.
20468   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
20469     if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
20470       uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
20471       assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
20472       return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
20473                          DAG.getConstant(RotateAmt, DL, MVT::i8));
20474     }
20475   }
20476
20477   // Use general rotate by variable (per-element).
20478   return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
20479 }
20480
20481 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
20482   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
20483   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
20484   // looks for this combo and may remove the "setcc" instruction if the "setcc"
20485   // has only one use.
20486   SDNode *N = Op.getNode();
20487   SDValue LHS = N->getOperand(0);
20488   SDValue RHS = N->getOperand(1);
20489   unsigned BaseOp = 0;
20490   unsigned Cond = 0;
20491   SDLoc DL(Op);
20492   switch (Op.getOpcode()) {
20493   default: llvm_unreachable("Unknown ovf instruction!");
20494   case ISD::SADDO:
20495     // A subtract of one will be selected as a INC. Note that INC doesn't
20496     // set CF, so we can't do this for UADDO.
20497     if (isOneConstant(RHS)) {
20498         BaseOp = X86ISD::INC;
20499         Cond = X86::COND_O;
20500         break;
20501       }
20502     BaseOp = X86ISD::ADD;
20503     Cond = X86::COND_O;
20504     break;
20505   case ISD::UADDO:
20506     BaseOp = X86ISD::ADD;
20507     Cond = X86::COND_B;
20508     break;
20509   case ISD::SSUBO:
20510     // A subtract of one will be selected as a DEC. Note that DEC doesn't
20511     // set CF, so we can't do this for USUBO.
20512     if (isOneConstant(RHS)) {
20513         BaseOp = X86ISD::DEC;
20514         Cond = X86::COND_O;
20515         break;
20516       }
20517     BaseOp = X86ISD::SUB;
20518     Cond = X86::COND_O;
20519     break;
20520   case ISD::USUBO:
20521     BaseOp = X86ISD::SUB;
20522     Cond = X86::COND_B;
20523     break;
20524   case ISD::SMULO:
20525     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
20526     Cond = X86::COND_O;
20527     break;
20528   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
20529     if (N->getValueType(0) == MVT::i8) {
20530       BaseOp = X86ISD::UMUL8;
20531       Cond = X86::COND_O;
20532       break;
20533     }
20534     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
20535                                  MVT::i32);
20536     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
20537
20538     SDValue SetCC =
20539       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20540                   DAG.getConstant(X86::COND_O, DL, MVT::i32),
20541                   SDValue(Sum.getNode(), 2));
20542
20543     if (N->getValueType(1) == MVT::i1) {
20544       SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20545                           DAG.getValueType(MVT::i1));
20546       SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
20547     }
20548     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
20549   }
20550   }
20551
20552   // Also sets EFLAGS.
20553   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
20554   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
20555
20556   SDValue SetCC =
20557     DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20558                 DAG.getConstant(Cond, DL, MVT::i32),
20559                 SDValue(Sum.getNode(), 1));
20560
20561   if (N->getValueType(1) == MVT::i1) {
20562     SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20563                         DAG.getValueType(MVT::i1));
20564     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
20565   }
20566   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
20567 }
20568
20569 /// Returns true if the operand type is exactly twice the native width, and
20570 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
20571 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
20572 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
20573 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
20574   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
20575
20576   if (OpWidth == 64)
20577     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
20578   else if (OpWidth == 128)
20579     return Subtarget.hasCmpxchg16b();
20580   else
20581     return false;
20582 }
20583
20584 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20585   return needsCmpXchgNb(SI->getValueOperand()->getType());
20586 }
20587
20588 // Note: this turns large loads into lock cmpxchg8b/16b.
20589 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
20590 TargetLowering::AtomicExpansionKind
20591 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20592   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
20593   return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
20594                                                : AtomicExpansionKind::None;
20595 }
20596
20597 TargetLowering::AtomicExpansionKind
20598 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
20599   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20600   Type *MemType = AI->getType();
20601
20602   // If the operand is too big, we must see if cmpxchg8/16b is available
20603   // and default to library calls otherwise.
20604   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
20605     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
20606                                    : AtomicExpansionKind::None;
20607   }
20608
20609   AtomicRMWInst::BinOp Op = AI->getOperation();
20610   switch (Op) {
20611   default:
20612     llvm_unreachable("Unknown atomic operation");
20613   case AtomicRMWInst::Xchg:
20614   case AtomicRMWInst::Add:
20615   case AtomicRMWInst::Sub:
20616     // It's better to use xadd, xsub or xchg for these in all cases.
20617     return AtomicExpansionKind::None;
20618   case AtomicRMWInst::Or:
20619   case AtomicRMWInst::And:
20620   case AtomicRMWInst::Xor:
20621     // If the atomicrmw's result isn't actually used, we can just add a "lock"
20622     // prefix to a normal instruction for these operations.
20623     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
20624                             : AtomicExpansionKind::None;
20625   case AtomicRMWInst::Nand:
20626   case AtomicRMWInst::Max:
20627   case AtomicRMWInst::Min:
20628   case AtomicRMWInst::UMax:
20629   case AtomicRMWInst::UMin:
20630     // These always require a non-trivial set of data operations on x86. We must
20631     // use a cmpxchg loop.
20632     return AtomicExpansionKind::CmpXChg;
20633   }
20634 }
20635
20636 LoadInst *
20637 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20638   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20639   Type *MemType = AI->getType();
20640   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
20641   // there is no benefit in turning such RMWs into loads, and it is actually
20642   // harmful as it introduces a mfence.
20643   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
20644     return nullptr;
20645
20646   auto Builder = IRBuilder<>(AI);
20647   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20648   auto SynchScope = AI->getSynchScope();
20649   // We must restrict the ordering to avoid generating loads with Release or
20650   // ReleaseAcquire orderings.
20651   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
20652   auto Ptr = AI->getPointerOperand();
20653
20654   // Before the load we need a fence. Here is an example lifted from
20655   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
20656   // is required:
20657   // Thread 0:
20658   //   x.store(1, relaxed);
20659   //   r1 = y.fetch_add(0, release);
20660   // Thread 1:
20661   //   y.fetch_add(42, acquire);
20662   //   r2 = x.load(relaxed);
20663   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
20664   // lowered to just a load without a fence. A mfence flushes the store buffer,
20665   // making the optimization clearly correct.
20666   // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
20667   // otherwise, we might be able to be more aggressive on relaxed idempotent
20668   // rmw. In practice, they do not look useful, so we don't try to be
20669   // especially clever.
20670   if (SynchScope == SingleThread)
20671     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
20672     // the IR level, so we must wrap it in an intrinsic.
20673     return nullptr;
20674
20675   if (!Subtarget.hasMFence())
20676     // FIXME: it might make sense to use a locked operation here but on a
20677     // different cache-line to prevent cache-line bouncing. In practice it
20678     // is probably a small win, and x86 processors without mfence are rare
20679     // enough that we do not bother.
20680     return nullptr;
20681
20682   Function *MFence =
20683       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
20684   Builder.CreateCall(MFence, {});
20685
20686   // Finally we can emit the atomic load.
20687   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
20688           AI->getType()->getPrimitiveSizeInBits());
20689   Loaded->setAtomic(Order, SynchScope);
20690   AI->replaceAllUsesWith(Loaded);
20691   AI->eraseFromParent();
20692   return Loaded;
20693 }
20694
20695 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
20696                                  SelectionDAG &DAG) {
20697   SDLoc dl(Op);
20698   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
20699     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
20700   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
20701     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
20702
20703   // The only fence that needs an instruction is a sequentially-consistent
20704   // cross-thread fence.
20705   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
20706       FenceScope == CrossThread) {
20707     if (Subtarget.hasMFence())
20708       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
20709
20710     SDValue Chain = Op.getOperand(0);
20711     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
20712     SDValue Ops[] = {
20713       DAG.getRegister(X86::ESP, MVT::i32),     // Base
20714       DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
20715       DAG.getRegister(0, MVT::i32),            // Index
20716       DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
20717       DAG.getRegister(0, MVT::i32),            // Segment.
20718       Zero,
20719       Chain
20720     };
20721     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
20722     return SDValue(Res, 0);
20723   }
20724
20725   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
20726   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
20727 }
20728
20729 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
20730                              SelectionDAG &DAG) {
20731   MVT T = Op.getSimpleValueType();
20732   SDLoc DL(Op);
20733   unsigned Reg = 0;
20734   unsigned size = 0;
20735   switch(T.SimpleTy) {
20736   default: llvm_unreachable("Invalid value type!");
20737   case MVT::i8:  Reg = X86::AL;  size = 1; break;
20738   case MVT::i16: Reg = X86::AX;  size = 2; break;
20739   case MVT::i32: Reg = X86::EAX; size = 4; break;
20740   case MVT::i64:
20741     assert(Subtarget.is64Bit() && "Node not type legal!");
20742     Reg = X86::RAX; size = 8;
20743     break;
20744   }
20745   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
20746                                   Op.getOperand(2), SDValue());
20747   SDValue Ops[] = { cpIn.getValue(0),
20748                     Op.getOperand(1),
20749                     Op.getOperand(3),
20750                     DAG.getTargetConstant(size, DL, MVT::i8),
20751                     cpIn.getValue(1) };
20752   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20753   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
20754   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
20755                                            Ops, T, MMO);
20756
20757   SDValue cpOut =
20758     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
20759   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
20760                                       MVT::i32, cpOut.getValue(2));
20761   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
20762                                 DAG.getConstant(X86::COND_E, DL, MVT::i8),
20763                                 EFLAGS);
20764
20765   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
20766   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
20767   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
20768   return SDValue();
20769 }
20770
20771 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
20772                             SelectionDAG &DAG) {
20773   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
20774   MVT DstVT = Op.getSimpleValueType();
20775
20776   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
20777       SrcVT == MVT::i64) {
20778     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
20779     if (DstVT != MVT::f64)
20780       // This conversion needs to be expanded.
20781       return SDValue();
20782
20783     SDValue Op0 = Op->getOperand(0);
20784     SmallVector<SDValue, 16> Elts;
20785     SDLoc dl(Op);
20786     unsigned NumElts;
20787     MVT SVT;
20788     if (SrcVT.isVector()) {
20789       NumElts = SrcVT.getVectorNumElements();
20790       SVT = SrcVT.getVectorElementType();
20791
20792       // Widen the vector in input in the case of MVT::v2i32.
20793       // Example: from MVT::v2i32 to MVT::v4i32.
20794       for (unsigned i = 0, e = NumElts; i != e; ++i)
20795         Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
20796                                    DAG.getIntPtrConstant(i, dl)));
20797     } else {
20798       assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
20799              "Unexpected source type in LowerBITCAST");
20800       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20801                                  DAG.getIntPtrConstant(0, dl)));
20802       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20803                                  DAG.getIntPtrConstant(1, dl)));
20804       NumElts = 2;
20805       SVT = MVT::i32;
20806     }
20807     // Explicitly mark the extra elements as Undef.
20808     Elts.append(NumElts, DAG.getUNDEF(SVT));
20809
20810     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20811     SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
20812     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
20813     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
20814                        DAG.getIntPtrConstant(0, dl));
20815   }
20816
20817   assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
20818          Subtarget.hasMMX() && "Unexpected custom BITCAST");
20819   assert((DstVT == MVT::i64 ||
20820           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
20821          "Unexpected custom BITCAST");
20822   // i64 <=> MMX conversions are Legal.
20823   if (SrcVT==MVT::i64 && DstVT.isVector())
20824     return Op;
20825   if (DstVT==MVT::i64 && SrcVT.isVector())
20826     return Op;
20827   // MMX <=> MMX conversions are Legal.
20828   if (SrcVT.isVector() && DstVT.isVector())
20829     return Op;
20830   // All other conversions need to be expanded.
20831   return SDValue();
20832 }
20833
20834 /// Compute the horizontal sum of bytes in V for the elements of VT.
20835 ///
20836 /// Requires V to be a byte vector and VT to be an integer vector type with
20837 /// wider elements than V's type. The width of the elements of VT determines
20838 /// how many bytes of V are summed horizontally to produce each element of the
20839 /// result.
20840 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
20841                                       const X86Subtarget &Subtarget,
20842                                       SelectionDAG &DAG) {
20843   SDLoc DL(V);
20844   MVT ByteVecVT = V.getSimpleValueType();
20845   MVT EltVT = VT.getVectorElementType();
20846   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
20847          "Expected value to have byte element type.");
20848   assert(EltVT != MVT::i8 &&
20849          "Horizontal byte sum only makes sense for wider elements!");
20850   unsigned VecSize = VT.getSizeInBits();
20851   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
20852
20853   // PSADBW instruction horizontally add all bytes and leave the result in i64
20854   // chunks, thus directly computes the pop count for v2i64 and v4i64.
20855   if (EltVT == MVT::i64) {
20856     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20857     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20858     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
20859     return DAG.getBitcast(VT, V);
20860   }
20861
20862   if (EltVT == MVT::i32) {
20863     // We unpack the low half and high half into i32s interleaved with zeros so
20864     // that we can use PSADBW to horizontally sum them. The most useful part of
20865     // this is that it lines up the results of two PSADBW instructions to be
20866     // two v2i64 vectors which concatenated are the 4 population counts. We can
20867     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
20868     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
20869     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
20870     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
20871
20872     // Do the horizontal sums into two v2i64s.
20873     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20874     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20875     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20876                       DAG.getBitcast(ByteVecVT, Low), Zeros);
20877     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20878                        DAG.getBitcast(ByteVecVT, High), Zeros);
20879
20880     // Merge them together.
20881     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
20882     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
20883                     DAG.getBitcast(ShortVecVT, Low),
20884                     DAG.getBitcast(ShortVecVT, High));
20885
20886     return DAG.getBitcast(VT, V);
20887   }
20888
20889   // The only element type left is i16.
20890   assert(EltVT == MVT::i16 && "Unknown how to handle type");
20891
20892   // To obtain pop count for each i16 element starting from the pop count for
20893   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
20894   // right by 8. It is important to shift as i16s as i8 vector shift isn't
20895   // directly supported.
20896   SDValue ShifterV = DAG.getConstant(8, DL, VT);
20897   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20898   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
20899                   DAG.getBitcast(ByteVecVT, V));
20900   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20901 }
20902
20903 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
20904                                         const X86Subtarget &Subtarget,
20905                                         SelectionDAG &DAG) {
20906   MVT VT = Op.getSimpleValueType();
20907   MVT EltVT = VT.getVectorElementType();
20908   unsigned VecSize = VT.getSizeInBits();
20909
20910   // Implement a lookup table in register by using an algorithm based on:
20911   // http://wm.ite.pl/articles/sse-popcount.html
20912   //
20913   // The general idea is that every lower byte nibble in the input vector is an
20914   // index into a in-register pre-computed pop count table. We then split up the
20915   // input vector in two new ones: (1) a vector with only the shifted-right
20916   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
20917   // masked out higher ones) for each byte. PSHUB is used separately with both
20918   // to index the in-register table. Next, both are added and the result is a
20919   // i8 vector where each element contains the pop count for input byte.
20920   //
20921   // To obtain the pop count for elements != i8, we follow up with the same
20922   // approach and use additional tricks as described below.
20923   //
20924   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
20925                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
20926                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
20927                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
20928
20929   int NumByteElts = VecSize / 8;
20930   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
20931   SDValue In = DAG.getBitcast(ByteVecVT, Op);
20932   SmallVector<SDValue, 64> LUTVec;
20933   for (int i = 0; i < NumByteElts; ++i)
20934     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20935   SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
20936   SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
20937
20938   // High nibbles
20939   SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
20940   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
20941
20942   // Low nibbles
20943   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
20944
20945   // The input vector is used as the shuffle mask that index elements into the
20946   // LUT. After counting low and high nibbles, add the vector to obtain the
20947   // final pop count per i8 element.
20948   SDValue HighPopCnt =
20949       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
20950   SDValue LowPopCnt =
20951       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
20952   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
20953
20954   if (EltVT == MVT::i8)
20955     return PopCnt;
20956
20957   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
20958 }
20959
20960 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
20961                                        const X86Subtarget &Subtarget,
20962                                        SelectionDAG &DAG) {
20963   MVT VT = Op.getSimpleValueType();
20964   assert(VT.is128BitVector() &&
20965          "Only 128-bit vector bitmath lowering supported.");
20966
20967   int VecSize = VT.getSizeInBits();
20968   MVT EltVT = VT.getVectorElementType();
20969   int Len = EltVT.getSizeInBits();
20970
20971   // This is the vectorized version of the "best" algorithm from
20972   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
20973   // with a minor tweak to use a series of adds + shifts instead of vector
20974   // multiplications. Implemented for all integer vector types. We only use
20975   // this when we don't have SSSE3 which allows a LUT-based lowering that is
20976   // much faster, even faster than using native popcnt instructions.
20977
20978   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
20979     MVT VT = V.getSimpleValueType();
20980     SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
20981     return DAG.getNode(OpCode, DL, VT, V, ShifterV);
20982   };
20983   auto GetMask = [&](SDValue V, APInt Mask) {
20984     MVT VT = V.getSimpleValueType();
20985     SDValue MaskV = DAG.getConstant(Mask, DL, VT);
20986     return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
20987   };
20988
20989   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
20990   // x86, so set the SRL type to have elements at least i16 wide. This is
20991   // correct because all of our SRLs are followed immediately by a mask anyways
20992   // that handles any bits that sneak into the high bits of the byte elements.
20993   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
20994
20995   SDValue V = Op;
20996
20997   // v = v - ((v >> 1) & 0x55555555...)
20998   SDValue Srl =
20999       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
21000   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
21001   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
21002
21003   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
21004   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
21005   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
21006   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
21007   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
21008
21009   // v = (v + (v >> 4)) & 0x0F0F0F0F...
21010   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
21011   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
21012   V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
21013
21014   // At this point, V contains the byte-wise population count, and we are
21015   // merely doing a horizontal sum if necessary to get the wider element
21016   // counts.
21017   if (EltVT == MVT::i8)
21018     return V;
21019
21020   return LowerHorizontalByteSum(
21021       DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
21022       DAG);
21023 }
21024
21025 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21026                                 SelectionDAG &DAG) {
21027   MVT VT = Op.getSimpleValueType();
21028   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
21029          "Unknown CTPOP type to handle");
21030   SDLoc DL(Op.getNode());
21031   SDValue Op0 = Op.getOperand(0);
21032
21033   if (!Subtarget.hasSSSE3()) {
21034     // We can't use the fast LUT approach, so fall back on vectorized bitmath.
21035     assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
21036     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
21037   }
21038
21039   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21040     unsigned NumElems = VT.getVectorNumElements();
21041
21042     // Extract each 128-bit vector, compute pop count and concat the result.
21043     SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
21044     SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
21045
21046     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21047                        LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21048                        LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21049   }
21050
21051   if (VT.is512BitVector() && !Subtarget.hasBWI()) {
21052     unsigned NumElems = VT.getVectorNumElements();
21053
21054     // Extract each 256-bit vector, compute pop count and concat the result.
21055     SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
21056     SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
21057
21058     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21059                        LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21060                        LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21061   }
21062
21063   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
21064 }
21065
21066 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21067                           SelectionDAG &DAG) {
21068   assert(Op.getSimpleValueType().isVector() &&
21069          "We only do custom lowering for vector population count.");
21070   return LowerVectorCTPOP(Op, Subtarget, DAG);
21071 }
21072
21073 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
21074   MVT VT = Op.getSimpleValueType();
21075   SDValue In = Op.getOperand(0);
21076   SDLoc DL(Op);
21077
21078   // For scalars, its still beneficial to transfer to/from the SIMD unit to
21079   // perform the BITREVERSE.
21080   if (!VT.isVector()) {
21081     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
21082     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
21083     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
21084     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
21085                        DAG.getIntPtrConstant(0, DL));
21086   }
21087
21088   MVT SVT = VT.getVectorElementType();
21089   int NumElts = VT.getVectorNumElements();
21090   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
21091
21092   // Decompose 256-bit ops into smaller 128-bit ops.
21093   if (VT.is256BitVector()) {
21094     SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21095     SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21096
21097     MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
21098     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21099                        DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
21100                        DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
21101   }
21102
21103   assert(VT.is128BitVector() &&
21104          "Only 128-bit vector bitreverse lowering supported.");
21105
21106   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
21107   // perform the BSWAP in the shuffle.
21108   // Its best to shuffle using the second operand as this will implicitly allow
21109   // memory folding for multiple vectors.
21110   SmallVector<SDValue, 16> MaskElts;
21111   for (int i = 0; i != NumElts; ++i) {
21112     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
21113       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
21114       int PermuteByte = SourceByte | (2 << 5);
21115       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
21116     }
21117   }
21118
21119   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
21120   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
21121   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
21122                     Res, Mask);
21123   return DAG.getBitcast(VT, Res);
21124 }
21125
21126 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
21127                                SelectionDAG &DAG) {
21128   if (Subtarget.hasXOP())
21129     return LowerBITREVERSE_XOP(Op, DAG);
21130
21131   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
21132
21133   MVT VT = Op.getSimpleValueType();
21134   SDValue In = Op.getOperand(0);
21135   SDLoc DL(Op);
21136
21137   unsigned NumElts = VT.getVectorNumElements();
21138   assert(VT.getScalarType() == MVT::i8 &&
21139          "Only byte vector BITREVERSE supported");
21140
21141   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
21142   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21143     MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
21144     SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21145     SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21146     Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
21147     Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
21148     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21149   }
21150
21151   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
21152   // two nibbles and a PSHUFB lookup to find the bitreverse of each
21153   // 0-15 value (moved to the other nibble).
21154   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
21155   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
21156   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
21157
21158   const int LoLUT[16] = {
21159       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
21160       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
21161       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
21162       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
21163   const int HiLUT[16] = {
21164       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
21165       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
21166       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
21167       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
21168
21169   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
21170   for (unsigned i = 0; i < NumElts; ++i) {
21171     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
21172     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
21173   }
21174
21175   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
21176   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
21177   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
21178   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
21179   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
21180 }
21181
21182 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
21183   unsigned NewOpc = 0;
21184   switch (N->getOpcode()) {
21185   case ISD::ATOMIC_LOAD_ADD:
21186     NewOpc = X86ISD::LADD;
21187     break;
21188   case ISD::ATOMIC_LOAD_SUB:
21189     NewOpc = X86ISD::LSUB;
21190     break;
21191   case ISD::ATOMIC_LOAD_OR:
21192     NewOpc = X86ISD::LOR;
21193     break;
21194   case ISD::ATOMIC_LOAD_XOR:
21195     NewOpc = X86ISD::LXOR;
21196     break;
21197   case ISD::ATOMIC_LOAD_AND:
21198     NewOpc = X86ISD::LAND;
21199     break;
21200   default:
21201     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
21202   }
21203
21204   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
21205   return DAG.getMemIntrinsicNode(
21206       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
21207       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
21208       /*MemVT=*/N->getSimpleValueType(0), MMO);
21209 }
21210
21211 /// Lower atomic_load_ops into LOCK-prefixed operations.
21212 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
21213                                 const X86Subtarget &Subtarget) {
21214   SDValue Chain = N->getOperand(0);
21215   SDValue LHS = N->getOperand(1);
21216   SDValue RHS = N->getOperand(2);
21217   unsigned Opc = N->getOpcode();
21218   MVT VT = N->getSimpleValueType(0);
21219   SDLoc DL(N);
21220
21221   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
21222   // can only be lowered when the result is unused.  They should have already
21223   // been transformed into a cmpxchg loop in AtomicExpand.
21224   if (N->hasAnyUseOfValue(0)) {
21225     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
21226     // select LXADD if LOCK_SUB can't be selected.
21227     if (Opc == ISD::ATOMIC_LOAD_SUB) {
21228       AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
21229       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
21230       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
21231                            RHS, AN->getMemOperand(), AN->getOrdering(),
21232                            AN->getSynchScope());
21233     }
21234     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
21235            "Used AtomicRMW ops other than Add should have been expanded!");
21236     return N;
21237   }
21238
21239   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
21240   // RAUW the chain, but don't worry about the result, as it's unused.
21241   assert(!N->hasAnyUseOfValue(0));
21242   DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
21243   return SDValue();
21244 }
21245
21246 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
21247   SDNode *Node = Op.getNode();
21248   SDLoc dl(Node);
21249   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
21250
21251   // Convert seq_cst store -> xchg
21252   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
21253   // FIXME: On 32-bit, store -> fist or movq would be more efficient
21254   //        (The only way to get a 16-byte store is cmpxchg16b)
21255   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
21256   if (cast<AtomicSDNode>(Node)->getOrdering() ==
21257           AtomicOrdering::SequentiallyConsistent ||
21258       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
21259     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
21260                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
21261                                  Node->getOperand(0),
21262                                  Node->getOperand(1), Node->getOperand(2),
21263                                  cast<AtomicSDNode>(Node)->getMemOperand(),
21264                                  cast<AtomicSDNode>(Node)->getOrdering(),
21265                                  cast<AtomicSDNode>(Node)->getSynchScope());
21266     return Swap.getValue(1);
21267   }
21268   // Other atomic stores have a simple pattern.
21269   return Op;
21270 }
21271
21272 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
21273   MVT VT = Op.getNode()->getSimpleValueType(0);
21274
21275   // Let legalize expand this if it isn't a legal type yet.
21276   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
21277     return SDValue();
21278
21279   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21280
21281   unsigned Opc;
21282   bool ExtraOp = false;
21283   switch (Op.getOpcode()) {
21284   default: llvm_unreachable("Invalid code");
21285   case ISD::ADDC: Opc = X86ISD::ADD; break;
21286   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
21287   case ISD::SUBC: Opc = X86ISD::SUB; break;
21288   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
21289   }
21290
21291   if (!ExtraOp)
21292     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
21293                        Op.getOperand(1));
21294   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
21295                      Op.getOperand(1), Op.getOperand(2));
21296 }
21297
21298 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
21299                             SelectionDAG &DAG) {
21300   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
21301
21302   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
21303   // which returns the values as { float, float } (in XMM0) or
21304   // { double, double } (which is returned in XMM0, XMM1).
21305   SDLoc dl(Op);
21306   SDValue Arg = Op.getOperand(0);
21307   EVT ArgVT = Arg.getValueType();
21308   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21309
21310   TargetLowering::ArgListTy Args;
21311   TargetLowering::ArgListEntry Entry;
21312
21313   Entry.Node = Arg;
21314   Entry.Ty = ArgTy;
21315   Entry.isSExt = false;
21316   Entry.isZExt = false;
21317   Args.push_back(Entry);
21318
21319   bool isF64 = ArgVT == MVT::f64;
21320   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
21321   // the small struct {f32, f32} is returned in (eax, edx). For f64,
21322   // the results are returned via SRet in memory.
21323   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
21324   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21325   SDValue Callee =
21326       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
21327
21328   Type *RetTy = isF64
21329     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
21330     : (Type*)VectorType::get(ArgTy, 4);
21331
21332   TargetLowering::CallLoweringInfo CLI(DAG);
21333   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
21334     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
21335
21336   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
21337
21338   if (isF64)
21339     // Returned in xmm0 and xmm1.
21340     return CallResult.first;
21341
21342   // Returned in bits 0:31 and 32:64 xmm0.
21343   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21344                                CallResult.first, DAG.getIntPtrConstant(0, dl));
21345   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21346                                CallResult.first, DAG.getIntPtrConstant(1, dl));
21347   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
21348   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
21349 }
21350
21351 /// Widen a vector input to a vector of NVT.  The
21352 /// input vector must have the same element type as NVT.
21353 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
21354                             bool FillWithZeroes = false) {
21355   // Check if InOp already has the right width.
21356   MVT InVT = InOp.getSimpleValueType();
21357   if (InVT == NVT)
21358     return InOp;
21359
21360   if (InOp.isUndef())
21361     return DAG.getUNDEF(NVT);
21362
21363   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
21364          "input and widen element type must match");
21365
21366   unsigned InNumElts = InVT.getVectorNumElements();
21367   unsigned WidenNumElts = NVT.getVectorNumElements();
21368   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
21369          "Unexpected request for vector widening");
21370
21371   EVT EltVT = NVT.getVectorElementType();
21372
21373   SDLoc dl(InOp);
21374   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
21375       InOp.getNumOperands() == 2) {
21376     SDValue N1 = InOp.getOperand(1);
21377     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
21378         N1.isUndef()) {
21379       InOp = InOp.getOperand(0);
21380       InVT = InOp.getSimpleValueType();
21381       InNumElts = InVT.getVectorNumElements();
21382     }
21383   }
21384   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
21385       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
21386     SmallVector<SDValue, 16> Ops;
21387     for (unsigned i = 0; i < InNumElts; ++i)
21388       Ops.push_back(InOp.getOperand(i));
21389
21390     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
21391       DAG.getUNDEF(EltVT);
21392     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
21393       Ops.push_back(FillVal);
21394     return DAG.getBuildVector(NVT, dl, Ops);
21395   }
21396   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
21397     DAG.getUNDEF(NVT);
21398   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
21399                      InOp, DAG.getIntPtrConstant(0, dl));
21400 }
21401
21402 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
21403                              SelectionDAG &DAG) {
21404   assert(Subtarget.hasAVX512() &&
21405          "MGATHER/MSCATTER are supported on AVX-512 arch only");
21406
21407   // X86 scatter kills mask register, so its type should be added to
21408   // the list of return values.
21409   // If the "scatter" has 2 return values, it is already handled.
21410   if (Op.getNode()->getNumValues() == 2)
21411     return Op;
21412
21413   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
21414   SDValue Src = N->getValue();
21415   MVT VT = Src.getSimpleValueType();
21416   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
21417   SDLoc dl(Op);
21418
21419   SDValue NewScatter;
21420   SDValue Index = N->getIndex();
21421   SDValue Mask = N->getMask();
21422   SDValue Chain = N->getChain();
21423   SDValue BasePtr = N->getBasePtr();
21424   MVT MemVT = N->getMemoryVT().getSimpleVT();
21425   MVT IndexVT = Index.getSimpleValueType();
21426   MVT MaskVT = Mask.getSimpleValueType();
21427
21428   if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
21429     // The v2i32 value was promoted to v2i64.
21430     // Now we "redo" the type legalizer's work and widen the original
21431     // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
21432     // with a shuffle.
21433     assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
21434            "Unexpected memory type");
21435     int ShuffleMask[] = {0, 2, -1, -1};
21436     Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
21437                                DAG.getUNDEF(MVT::v4i32), ShuffleMask);
21438     // Now we have 4 elements instead of 2.
21439     // Expand the index.
21440     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
21441     Index = ExtendToType(Index, NewIndexVT, DAG);
21442
21443     // Expand the mask with zeroes
21444     // Mask may be <2 x i64> or <2 x i1> at this moment
21445     assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
21446            "Unexpected mask type");
21447     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
21448     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21449     VT = MVT::v4i32;
21450   }
21451
21452   unsigned NumElts = VT.getVectorNumElements();
21453   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21454       !Index.getSimpleValueType().is512BitVector()) {
21455     // AVX512F supports only 512-bit vectors. Or data or index should
21456     // be 512 bit wide. If now the both index and data are 256-bit, but
21457     // the vector contains 8 elements, we just sign-extend the index
21458     if (IndexVT == MVT::v8i32)
21459       // Just extend index
21460       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21461     else {
21462       // The minimal number of elts in scatter is 8
21463       NumElts = 8;
21464       // Index
21465       MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21466       // Use original index here, do not modify the index twice
21467       Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
21468       if (IndexVT.getScalarType() == MVT::i32)
21469         Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21470
21471       // Mask
21472       // At this point we have promoted mask operand
21473       assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21474       MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21475       // Use the original mask here, do not modify the mask twice
21476       Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
21477
21478       // The value that should be stored
21479       MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21480       Src = ExtendToType(Src, NewVT, DAG);
21481     }
21482   }
21483   // If the mask is "wide" at this point - truncate it to i1 vector
21484   MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
21485   Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
21486
21487   // The mask is killed by scatter, add it to the values
21488   SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
21489   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
21490   NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
21491                                     N->getMemOperand());
21492   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
21493   return SDValue(NewScatter.getNode(), 1);
21494 }
21495
21496 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
21497                           SelectionDAG &DAG) {
21498
21499   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
21500   MVT VT = Op.getSimpleValueType();
21501   MVT ScalarVT = VT.getScalarType();
21502   SDValue Mask = N->getMask();
21503   SDLoc dl(Op);
21504
21505   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21506          "Cannot lower masked load op.");
21507
21508   assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21509           (Subtarget.hasBWI() &&
21510               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21511          "Unsupported masked load op.");
21512
21513   // This operation is legal for targets with VLX, but without
21514   // VLX the vector should be widened to 512 bit
21515   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21516   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21517   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21518   SDValue Src0 = N->getSrc0();
21519   Src0 = ExtendToType(Src0, WideDataVT, DAG);
21520   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
21521   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
21522                                       N->getBasePtr(), Mask, Src0,
21523                                       N->getMemoryVT(), N->getMemOperand(),
21524                                       N->getExtensionType());
21525
21526   SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21527                                NewLoad.getValue(0),
21528                                DAG.getIntPtrConstant(0, dl));
21529   SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
21530   return DAG.getMergeValues(RetOps, dl);
21531 }
21532
21533 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
21534                            SelectionDAG &DAG) {
21535   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
21536   SDValue DataToStore = N->getValue();
21537   MVT VT = DataToStore.getSimpleValueType();
21538   MVT ScalarVT = VT.getScalarType();
21539   SDValue Mask = N->getMask();
21540   SDLoc dl(Op);
21541
21542   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21543          "Cannot lower masked store op.");
21544
21545   assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21546           (Subtarget.hasBWI() &&
21547               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21548           "Unsupported masked store op.");
21549
21550   // This operation is legal for targets with VLX, but without
21551   // VLX the vector should be widened to 512 bit
21552   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21553   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21554   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21555   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
21556   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
21557   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
21558                             Mask, N->getMemoryVT(), N->getMemOperand(),
21559                             N->isTruncatingStore());
21560 }
21561
21562 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
21563                             SelectionDAG &DAG) {
21564   assert(Subtarget.hasAVX512() &&
21565          "MGATHER/MSCATTER are supported on AVX-512 arch only");
21566
21567   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
21568   SDLoc dl(Op);
21569   MVT VT = Op.getSimpleValueType();
21570   SDValue Index = N->getIndex();
21571   SDValue Mask = N->getMask();
21572   SDValue Src0 = N->getValue();
21573   MVT IndexVT = Index.getSimpleValueType();
21574   MVT MaskVT = Mask.getSimpleValueType();
21575
21576   unsigned NumElts = VT.getVectorNumElements();
21577   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
21578
21579   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21580       !Index.getSimpleValueType().is512BitVector()) {
21581     // AVX512F supports only 512-bit vectors. Or data or index should
21582     // be 512 bit wide. If now the both index and data are 256-bit, but
21583     // the vector contains 8 elements, we just sign-extend the index
21584     if (NumElts == 8) {
21585       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21586       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
21587                         N->getOperand(3), Index };
21588       DAG.UpdateNodeOperands(N, Ops);
21589       return Op;
21590     }
21591
21592     // Minimal number of elements in Gather
21593     NumElts = 8;
21594     // Index
21595     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21596     Index = ExtendToType(Index, NewIndexVT, DAG);
21597     if (IndexVT.getScalarType() == MVT::i32)
21598       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21599
21600     // Mask
21601     MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
21602     // At this point we have promoted mask operand
21603     assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21604     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21605     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21606     Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
21607
21608     // The pass-thru value
21609     MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21610     Src0 = ExtendToType(Src0, NewVT, DAG);
21611
21612     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
21613     SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
21614                                             N->getMemoryVT(), dl, Ops,
21615                                             N->getMemOperand());
21616     SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21617                                  NewGather.getValue(0),
21618                                  DAG.getIntPtrConstant(0, dl));
21619     SDValue RetOps[] = {Exract, NewGather.getValue(1)};
21620     return DAG.getMergeValues(RetOps, dl);
21621   }
21622   return Op;
21623 }
21624
21625 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
21626                                                     SelectionDAG &DAG) const {
21627   // TODO: Eventually, the lowering of these nodes should be informed by or
21628   // deferred to the GC strategy for the function in which they appear. For
21629   // now, however, they must be lowered to something. Since they are logically
21630   // no-ops in the case of a null GC strategy (or a GC strategy which does not
21631   // require special handling for these nodes), lower them as literal NOOPs for
21632   // the time being.
21633   SmallVector<SDValue, 2> Ops;
21634
21635   Ops.push_back(Op.getOperand(0));
21636   if (Op->getGluedNode())
21637     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21638
21639   SDLoc OpDL(Op);
21640   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21641   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21642
21643   return NOOP;
21644 }
21645
21646 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
21647                                                   SelectionDAG &DAG) const {
21648   // TODO: Eventually, the lowering of these nodes should be informed by or
21649   // deferred to the GC strategy for the function in which they appear. For
21650   // now, however, they must be lowered to something. Since they are logically
21651   // no-ops in the case of a null GC strategy (or a GC strategy which does not
21652   // require special handling for these nodes), lower them as literal NOOPs for
21653   // the time being.
21654   SmallVector<SDValue, 2> Ops;
21655
21656   Ops.push_back(Op.getOperand(0));
21657   if (Op->getGluedNode())
21658     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21659
21660   SDLoc OpDL(Op);
21661   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21662   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21663
21664   return NOOP;
21665 }
21666
21667 /// Provide custom lowering hooks for some operations.
21668 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
21669   switch (Op.getOpcode()) {
21670   default: llvm_unreachable("Should not custom lower this!");
21671   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
21672   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
21673     return LowerCMP_SWAP(Op, Subtarget, DAG);
21674   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
21675   case ISD::ATOMIC_LOAD_ADD:
21676   case ISD::ATOMIC_LOAD_SUB:
21677   case ISD::ATOMIC_LOAD_OR:
21678   case ISD::ATOMIC_LOAD_XOR:
21679   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
21680   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
21681   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
21682   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
21683   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
21684   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
21685   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
21686   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
21687   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
21688   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
21689   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
21690   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
21691   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
21692   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
21693   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
21694   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
21695   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
21696   case ISD::SHL_PARTS:
21697   case ISD::SRA_PARTS:
21698   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
21699   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
21700   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
21701   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
21702   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
21703   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
21704   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
21705   case ISD::SIGN_EXTEND_VECTOR_INREG:
21706     return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
21707   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
21708   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
21709   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
21710   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
21711   case ISD::FABS:
21712   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
21713   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
21714   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
21715   case ISD::SETCC:              return LowerSETCC(Op, DAG);
21716   case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
21717   case ISD::SELECT:             return LowerSELECT(Op, DAG);
21718   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
21719   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
21720   case ISD::VASTART:            return LowerVASTART(Op, DAG);
21721   case ISD::VAARG:              return LowerVAARG(Op, DAG);
21722   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
21723   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
21724   case ISD::INTRINSIC_VOID:
21725   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
21726   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
21727   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
21728   case ISD::FRAME_TO_ARGS_OFFSET:
21729                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
21730   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
21731   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
21732   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
21733   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
21734   case ISD::EH_SJLJ_SETUP_DISPATCH:
21735     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
21736   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
21737   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
21738   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
21739   case ISD::CTLZ:
21740   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
21741   case ISD::CTTZ:
21742   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
21743   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
21744   case ISD::MULHS:
21745   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
21746   case ISD::UMUL_LOHI:
21747   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
21748   case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
21749   case ISD::SRA:
21750   case ISD::SRL:
21751   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
21752   case ISD::SADDO:
21753   case ISD::UADDO:
21754   case ISD::SSUBO:
21755   case ISD::USUBO:
21756   case ISD::SMULO:
21757   case ISD::UMULO:              return LowerXALUO(Op, DAG);
21758   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
21759   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
21760   case ISD::ADDC:
21761   case ISD::ADDE:
21762   case ISD::SUBC:
21763   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
21764   case ISD::ADD:                return LowerADD(Op, DAG);
21765   case ISD::SUB:                return LowerSUB(Op, DAG);
21766   case ISD::SMAX:
21767   case ISD::SMIN:
21768   case ISD::UMAX:
21769   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
21770   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
21771   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
21772   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
21773   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
21774   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
21775   case ISD::GC_TRANSITION_START:
21776                                 return LowerGC_TRANSITION_START(Op, DAG);
21777   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
21778   case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
21779   }
21780 }
21781
21782 /// Places new result values for the node in Results (their number
21783 /// and types must exactly match those of the original return values of
21784 /// the node), or leaves Results empty, which indicates that the node is not
21785 /// to be custom lowered after all.
21786 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
21787                                               SmallVectorImpl<SDValue> &Results,
21788                                               SelectionDAG &DAG) const {
21789   SDValue Res = LowerOperation(SDValue(N, 0), DAG);
21790
21791   if (!Res.getNode())
21792     return;
21793
21794   assert((N->getNumValues() <= Res->getNumValues()) &&
21795       "Lowering returned the wrong number of results!");
21796
21797   // Places new result values base on N result number.
21798   // In some cases (LowerSINT_TO_FP for example) Res has more result values
21799   // than original node, chain should be dropped(last value).
21800   for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
21801       Results.push_back(Res.getValue(I));
21802 }
21803
21804 /// Replace a node with an illegal result type with a new node built out of
21805 /// custom code.
21806 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
21807                                            SmallVectorImpl<SDValue>&Results,
21808                                            SelectionDAG &DAG) const {
21809   SDLoc dl(N);
21810   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21811   switch (N->getOpcode()) {
21812   default:
21813     llvm_unreachable("Do not know how to custom type legalize this operation!");
21814   case X86ISD::AVG: {
21815     // Legalize types for X86ISD::AVG by expanding vectors.
21816     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21817
21818     auto InVT = N->getValueType(0);
21819     auto InVTSize = InVT.getSizeInBits();
21820     const unsigned RegSize =
21821         (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
21822     assert((!Subtarget.hasAVX512() || RegSize < 512) &&
21823            "512-bit vector requires AVX512");
21824     assert((!Subtarget.hasAVX2() || RegSize < 256) &&
21825            "256-bit vector requires AVX2");
21826
21827     auto ElemVT = InVT.getVectorElementType();
21828     auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
21829                                   RegSize / ElemVT.getSizeInBits());
21830     assert(RegSize % InVT.getSizeInBits() == 0);
21831     unsigned NumConcat = RegSize / InVT.getSizeInBits();
21832
21833     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
21834     Ops[0] = N->getOperand(0);
21835     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21836     Ops[0] = N->getOperand(1);
21837     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21838
21839     SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
21840     Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
21841                                   DAG.getIntPtrConstant(0, dl)));
21842     return;
21843   }
21844   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
21845   case X86ISD::FMINC:
21846   case X86ISD::FMIN:
21847   case X86ISD::FMAXC:
21848   case X86ISD::FMAX: {
21849     EVT VT = N->getValueType(0);
21850     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
21851     SDValue UNDEF = DAG.getUNDEF(VT);
21852     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21853                               N->getOperand(0), UNDEF);
21854     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21855                               N->getOperand(1), UNDEF);
21856     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
21857     return;
21858   }
21859   case ISD::SIGN_EXTEND_INREG:
21860   case ISD::ADDC:
21861   case ISD::ADDE:
21862   case ISD::SUBC:
21863   case ISD::SUBE:
21864     // We don't want to expand or promote these.
21865     return;
21866   case ISD::SDIV:
21867   case ISD::UDIV:
21868   case ISD::SREM:
21869   case ISD::UREM:
21870   case ISD::SDIVREM:
21871   case ISD::UDIVREM: {
21872     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
21873     Results.push_back(V);
21874     return;
21875   }
21876   case ISD::FP_TO_SINT:
21877   case ISD::FP_TO_UINT: {
21878     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
21879
21880     std::pair<SDValue,SDValue> Vals =
21881         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
21882     SDValue FIST = Vals.first, StackSlot = Vals.second;
21883     if (FIST.getNode()) {
21884       EVT VT = N->getValueType(0);
21885       // Return a load from the stack slot.
21886       if (StackSlot.getNode())
21887         Results.push_back(
21888             DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
21889       else
21890         Results.push_back(FIST);
21891     }
21892     return;
21893   }
21894   case ISD::UINT_TO_FP: {
21895     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21896     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
21897         N->getValueType(0) != MVT::v2f32)
21898       return;
21899     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
21900                                  N->getOperand(0));
21901     SDValue VBias =
21902         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
21903     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
21904                              DAG.getBitcast(MVT::v2i64, VBias));
21905     Or = DAG.getBitcast(MVT::v2f64, Or);
21906     // TODO: Are there any fast-math-flags to propagate here?
21907     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
21908     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
21909     return;
21910   }
21911   case ISD::FP_ROUND: {
21912     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
21913         return;
21914     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
21915     Results.push_back(V);
21916     return;
21917   }
21918   case ISD::FP_EXTEND: {
21919     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
21920     // No other ValueType for FP_EXTEND should reach this point.
21921     assert(N->getValueType(0) == MVT::v2f32 &&
21922            "Do not know how to legalize this Node");
21923     return;
21924   }
21925   case ISD::INTRINSIC_W_CHAIN: {
21926     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
21927     switch (IntNo) {
21928     default : llvm_unreachable("Do not know how to custom type "
21929                                "legalize this intrinsic operation!");
21930     case Intrinsic::x86_rdtsc:
21931       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21932                                      Results);
21933     case Intrinsic::x86_rdtscp:
21934       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
21935                                      Results);
21936     case Intrinsic::x86_rdpmc:
21937       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
21938     }
21939   }
21940   case ISD::INTRINSIC_WO_CHAIN: {
21941     if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
21942       Results.push_back(V);
21943     return;
21944   }
21945   case ISD::READCYCLECOUNTER: {
21946     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21947                                    Results);
21948   }
21949   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
21950     EVT T = N->getValueType(0);
21951     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
21952     bool Regs64bit = T == MVT::i128;
21953     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
21954     SDValue cpInL, cpInH;
21955     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21956                         DAG.getConstant(0, dl, HalfT));
21957     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21958                         DAG.getConstant(1, dl, HalfT));
21959     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
21960                              Regs64bit ? X86::RAX : X86::EAX,
21961                              cpInL, SDValue());
21962     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
21963                              Regs64bit ? X86::RDX : X86::EDX,
21964                              cpInH, cpInL.getValue(1));
21965     SDValue swapInL, swapInH;
21966     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21967                           DAG.getConstant(0, dl, HalfT));
21968     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21969                           DAG.getConstant(1, dl, HalfT));
21970     swapInH =
21971         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
21972                          swapInH, cpInH.getValue(1));
21973     // If the current function needs the base pointer, RBX,
21974     // we shouldn't use cmpxchg directly.
21975     // Indeed the lowering of that instruction will clobber
21976     // that register and since RBX will be a reserved register
21977     // the register allocator will not make sure its value will
21978     // be properly saved and restored around this live-range.
21979     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
21980     SDValue Result;
21981     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21982     unsigned BasePtr = TRI->getBaseRegister();
21983     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
21984     if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
21985         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
21986       // ISel prefers the LCMPXCHG64 variant.
21987       // If that assert breaks, that means it is not the case anymore,
21988       // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
21989       // not just EBX. This is a matter of accepting i64 input for that
21990       // pseudo, and restoring into the register of the right wide
21991       // in expand pseudo. Everything else should just work.
21992       assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
21993              "Saving only half of the RBX");
21994       unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
21995                                   : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
21996       SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
21997                                            Regs64bit ? X86::RBX : X86::EBX,
21998                                            HalfT, swapInH.getValue(1));
21999       SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
22000                        RBXSave,
22001                        /*Glue*/ RBXSave.getValue(2)};
22002       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
22003     } else {
22004       unsigned Opcode =
22005           Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
22006       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
22007                                  Regs64bit ? X86::RBX : X86::EBX, swapInL,
22008                                  swapInH.getValue(1));
22009       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
22010                        swapInL.getValue(1)};
22011       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
22012     }
22013     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
22014                                         Regs64bit ? X86::RAX : X86::EAX,
22015                                         HalfT, Result.getValue(1));
22016     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
22017                                         Regs64bit ? X86::RDX : X86::EDX,
22018                                         HalfT, cpOutL.getValue(2));
22019     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
22020
22021     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
22022                                         MVT::i32, cpOutH.getValue(2));
22023     SDValue Success =
22024         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22025                     DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
22026     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
22027
22028     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
22029     Results.push_back(Success);
22030     Results.push_back(EFLAGS.getValue(1));
22031     return;
22032   }
22033   case ISD::ATOMIC_SWAP:
22034   case ISD::ATOMIC_LOAD_ADD:
22035   case ISD::ATOMIC_LOAD_SUB:
22036   case ISD::ATOMIC_LOAD_AND:
22037   case ISD::ATOMIC_LOAD_OR:
22038   case ISD::ATOMIC_LOAD_XOR:
22039   case ISD::ATOMIC_LOAD_NAND:
22040   case ISD::ATOMIC_LOAD_MIN:
22041   case ISD::ATOMIC_LOAD_MAX:
22042   case ISD::ATOMIC_LOAD_UMIN:
22043   case ISD::ATOMIC_LOAD_UMAX:
22044   case ISD::ATOMIC_LOAD: {
22045     // Delegate to generic TypeLegalization. Situations we can really handle
22046     // should have already been dealt with by AtomicExpandPass.cpp.
22047     break;
22048   }
22049   case ISD::BITCAST: {
22050     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22051     EVT DstVT = N->getValueType(0);
22052     EVT SrcVT = N->getOperand(0)->getValueType(0);
22053
22054     if (SrcVT != MVT::f64 ||
22055         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
22056       return;
22057
22058     unsigned NumElts = DstVT.getVectorNumElements();
22059     EVT SVT = DstVT.getVectorElementType();
22060     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22061     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
22062                                    MVT::v2f64, N->getOperand(0));
22063     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
22064
22065     if (ExperimentalVectorWideningLegalization) {
22066       // If we are legalizing vectors by widening, we already have the desired
22067       // legal vector type, just return it.
22068       Results.push_back(ToVecInt);
22069       return;
22070     }
22071
22072     SmallVector<SDValue, 8> Elts;
22073     for (unsigned i = 0, e = NumElts; i != e; ++i)
22074       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
22075                                    ToVecInt, DAG.getIntPtrConstant(i, dl)));
22076
22077     Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
22078   }
22079   }
22080 }
22081
22082 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
22083   switch ((X86ISD::NodeType)Opcode) {
22084   case X86ISD::FIRST_NUMBER:       break;
22085   case X86ISD::BSF:                return "X86ISD::BSF";
22086   case X86ISD::BSR:                return "X86ISD::BSR";
22087   case X86ISD::SHLD:               return "X86ISD::SHLD";
22088   case X86ISD::SHRD:               return "X86ISD::SHRD";
22089   case X86ISD::FAND:               return "X86ISD::FAND";
22090   case X86ISD::FANDN:              return "X86ISD::FANDN";
22091   case X86ISD::FOR:                return "X86ISD::FOR";
22092   case X86ISD::FXOR:               return "X86ISD::FXOR";
22093   case X86ISD::FILD:               return "X86ISD::FILD";
22094   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
22095   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
22096   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
22097   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
22098   case X86ISD::FLD:                return "X86ISD::FLD";
22099   case X86ISD::FST:                return "X86ISD::FST";
22100   case X86ISD::CALL:               return "X86ISD::CALL";
22101   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
22102   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
22103   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
22104   case X86ISD::BT:                 return "X86ISD::BT";
22105   case X86ISD::CMP:                return "X86ISD::CMP";
22106   case X86ISD::COMI:               return "X86ISD::COMI";
22107   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
22108   case X86ISD::CMPM:               return "X86ISD::CMPM";
22109   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
22110   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
22111   case X86ISD::SETCC:              return "X86ISD::SETCC";
22112   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
22113   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
22114   case X86ISD::CMOV:               return "X86ISD::CMOV";
22115   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
22116   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
22117   case X86ISD::IRET:               return "X86ISD::IRET";
22118   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
22119   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
22120   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
22121   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
22122   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
22123   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
22124   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
22125   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
22126   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
22127   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
22128   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
22129   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
22130   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
22131   case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
22132   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
22133   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
22134   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
22135   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
22136   case X86ISD::ADDUS:              return "X86ISD::ADDUS";
22137   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
22138   case X86ISD::HADD:               return "X86ISD::HADD";
22139   case X86ISD::HSUB:               return "X86ISD::HSUB";
22140   case X86ISD::FHADD:              return "X86ISD::FHADD";
22141   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
22142   case X86ISD::ABS:                return "X86ISD::ABS";
22143   case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
22144   case X86ISD::FMAX:               return "X86ISD::FMAX";
22145   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
22146   case X86ISD::FMIN:               return "X86ISD::FMIN";
22147   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
22148   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
22149   case X86ISD::FMINC:              return "X86ISD::FMINC";
22150   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
22151   case X86ISD::FRSQRTS:             return "X86ISD::FRSQRTS";
22152   case X86ISD::FRCP:               return "X86ISD::FRCP";
22153   case X86ISD::FRCPS:              return "X86ISD::FRCPS";
22154   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
22155   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
22156   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
22157   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
22158   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
22159   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
22160   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
22161   case X86ISD::EH_SJLJ_SETUP_DISPATCH:
22162     return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
22163   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
22164   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
22165   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
22166   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
22167   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
22168   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
22169   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
22170   case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
22171     return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
22172   case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
22173     return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
22174   case X86ISD::LADD:               return "X86ISD::LADD";
22175   case X86ISD::LSUB:               return "X86ISD::LSUB";
22176   case X86ISD::LOR:                return "X86ISD::LOR";
22177   case X86ISD::LXOR:               return "X86ISD::LXOR";
22178   case X86ISD::LAND:               return "X86ISD::LAND";
22179   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
22180   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
22181   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
22182   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
22183   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
22184   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
22185   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
22186   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
22187   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
22188   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
22189   case X86ISD::CVTDQ2PD:           return "X86ISD::CVTDQ2PD";
22190   case X86ISD::CVTUDQ2PD:          return "X86ISD::CVTUDQ2PD";
22191   case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
22192   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
22193   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
22194   case X86ISD::VSHL:               return "X86ISD::VSHL";
22195   case X86ISD::VSRL:               return "X86ISD::VSRL";
22196   case X86ISD::VSRA:               return "X86ISD::VSRA";
22197   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
22198   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
22199   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
22200   case X86ISD::VSRAV:              return "X86ISD::VSRAV";
22201   case X86ISD::VROTLI:             return "X86ISD::VROTLI";
22202   case X86ISD::VROTRI:             return "X86ISD::VROTRI";
22203   case X86ISD::VPPERM:             return "X86ISD::VPPERM";
22204   case X86ISD::CMPP:               return "X86ISD::CMPP";
22205   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
22206   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
22207   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
22208   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
22209   case X86ISD::ADD:                return "X86ISD::ADD";
22210   case X86ISD::SUB:                return "X86ISD::SUB";
22211   case X86ISD::ADC:                return "X86ISD::ADC";
22212   case X86ISD::SBB:                return "X86ISD::SBB";
22213   case X86ISD::SMUL:               return "X86ISD::SMUL";
22214   case X86ISD::UMUL:               return "X86ISD::UMUL";
22215   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
22216   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
22217   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
22218   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
22219   case X86ISD::INC:                return "X86ISD::INC";
22220   case X86ISD::DEC:                return "X86ISD::DEC";
22221   case X86ISD::OR:                 return "X86ISD::OR";
22222   case X86ISD::XOR:                return "X86ISD::XOR";
22223   case X86ISD::AND:                return "X86ISD::AND";
22224   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
22225   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
22226   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
22227   case X86ISD::PTEST:              return "X86ISD::PTEST";
22228   case X86ISD::TESTP:              return "X86ISD::TESTP";
22229   case X86ISD::TESTM:              return "X86ISD::TESTM";
22230   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
22231   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
22232   case X86ISD::KTEST:              return "X86ISD::KTEST";
22233   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
22234   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
22235   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
22236   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
22237   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
22238   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
22239   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
22240   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
22241   case X86ISD::SHUF128:            return "X86ISD::SHUF128";
22242   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
22243   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
22244   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
22245   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
22246   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
22247   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
22248   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
22249   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
22250   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
22251   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
22252   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
22253   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
22254   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
22255   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
22256   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
22257   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
22258   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
22259   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
22260   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
22261   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
22262   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
22263   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
22264   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
22265   case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
22266   case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
22267   case X86ISD::VFIXUPIMMS:          return "X86ISD::VFIXUPIMMS";
22268   case X86ISD::VRANGE:             return "X86ISD::VRANGE";
22269   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
22270   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
22271   case X86ISD::PSADBW:             return "X86ISD::PSADBW";
22272   case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
22273   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
22274   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
22275   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
22276   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
22277   case X86ISD::MFENCE:             return "X86ISD::MFENCE";
22278   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
22279   case X86ISD::SAHF:               return "X86ISD::SAHF";
22280   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
22281   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
22282   case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
22283   case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
22284   case X86ISD::VPROT:              return "X86ISD::VPROT";
22285   case X86ISD::VPROTI:             return "X86ISD::VPROTI";
22286   case X86ISD::VPSHA:              return "X86ISD::VPSHA";
22287   case X86ISD::VPSHL:              return "X86ISD::VPSHL";
22288   case X86ISD::VPCOM:              return "X86ISD::VPCOM";
22289   case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
22290   case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
22291   case X86ISD::FMADD:              return "X86ISD::FMADD";
22292   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
22293   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
22294   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
22295   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
22296   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
22297   case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
22298   case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
22299   case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
22300   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
22301   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
22302   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
22303   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
22304   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
22305   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
22306   case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
22307   case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
22308   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
22309   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
22310   case X86ISD::XTEST:              return "X86ISD::XTEST";
22311   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
22312   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
22313   case X86ISD::SELECT:             return "X86ISD::SELECT";
22314   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
22315   case X86ISD::RCP28:              return "X86ISD::RCP28";
22316   case X86ISD::EXP2:               return "X86ISD::EXP2";
22317   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
22318   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
22319   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
22320   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
22321   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
22322   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
22323   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
22324   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
22325   case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
22326   case X86ISD::ADDS:               return "X86ISD::ADDS";
22327   case X86ISD::SUBS:               return "X86ISD::SUBS";
22328   case X86ISD::AVG:                return "X86ISD::AVG";
22329   case X86ISD::MULHRS:             return "X86ISD::MULHRS";
22330   case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
22331   case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
22332   case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
22333   case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
22334   case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
22335   case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
22336   case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
22337   case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
22338   case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
22339   }
22340   return nullptr;
22341 }
22342
22343 /// Return true if the addressing mode represented by AM is legal for this
22344 /// target, for a load/store of the specified type.
22345 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
22346                                               const AddrMode &AM, Type *Ty,
22347                                               unsigned AS) const {
22348   // X86 supports extremely general addressing modes.
22349   CodeModel::Model M = getTargetMachine().getCodeModel();
22350
22351   // X86 allows a sign-extended 32-bit immediate field as a displacement.
22352   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
22353     return false;
22354
22355   if (AM.BaseGV) {
22356     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
22357
22358     // If a reference to this global requires an extra load, we can't fold it.
22359     if (isGlobalStubReference(GVFlags))
22360       return false;
22361
22362     // If BaseGV requires a register for the PIC base, we cannot also have a
22363     // BaseReg specified.
22364     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
22365       return false;
22366
22367     // If lower 4G is not available, then we must use rip-relative addressing.
22368     if ((M != CodeModel::Small || isPositionIndependent()) &&
22369         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
22370       return false;
22371   }
22372
22373   switch (AM.Scale) {
22374   case 0:
22375   case 1:
22376   case 2:
22377   case 4:
22378   case 8:
22379     // These scales always work.
22380     break;
22381   case 3:
22382   case 5:
22383   case 9:
22384     // These scales are formed with basereg+scalereg.  Only accept if there is
22385     // no basereg yet.
22386     if (AM.HasBaseReg)
22387       return false;
22388     break;
22389   default:  // Other stuff never works.
22390     return false;
22391   }
22392
22393   return true;
22394 }
22395
22396 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
22397   unsigned Bits = Ty->getScalarSizeInBits();
22398
22399   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
22400   // particularly cheaper than those without.
22401   if (Bits == 8)
22402     return false;
22403
22404   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
22405   // variable shifts just as cheap as scalar ones.
22406   if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
22407     return false;
22408
22409   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
22410   // fully general vector.
22411   return true;
22412 }
22413
22414 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
22415   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22416     return false;
22417   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
22418   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
22419   return NumBits1 > NumBits2;
22420 }
22421
22422 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
22423   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22424     return false;
22425
22426   if (!isTypeLegal(EVT::getEVT(Ty1)))
22427     return false;
22428
22429   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
22430
22431   // Assuming the caller doesn't have a zeroext or signext return parameter,
22432   // truncation all the way down to i1 is valid.
22433   return true;
22434 }
22435
22436 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
22437   return isInt<32>(Imm);
22438 }
22439
22440 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
22441   // Can also use sub to handle negated immediates.
22442   return isInt<32>(Imm);
22443 }
22444
22445 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
22446   if (!VT1.isInteger() || !VT2.isInteger())
22447     return false;
22448   unsigned NumBits1 = VT1.getSizeInBits();
22449   unsigned NumBits2 = VT2.getSizeInBits();
22450   return NumBits1 > NumBits2;
22451 }
22452
22453 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
22454   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
22455   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
22456 }
22457
22458 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
22459   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
22460   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
22461 }
22462
22463 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
22464   EVT VT1 = Val.getValueType();
22465   if (isZExtFree(VT1, VT2))
22466     return true;
22467
22468   if (Val.getOpcode() != ISD::LOAD)
22469     return false;
22470
22471   if (!VT1.isSimple() || !VT1.isInteger() ||
22472       !VT2.isSimple() || !VT2.isInteger())
22473     return false;
22474
22475   switch (VT1.getSimpleVT().SimpleTy) {
22476   default: break;
22477   case MVT::i8:
22478   case MVT::i16:
22479   case MVT::i32:
22480     // X86 has 8, 16, and 32-bit zero-extending loads.
22481     return true;
22482   }
22483
22484   return false;
22485 }
22486
22487 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
22488
22489 bool
22490 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
22491   if (!Subtarget.hasAnyFMA())
22492     return false;
22493
22494   VT = VT.getScalarType();
22495
22496   if (!VT.isSimple())
22497     return false;
22498
22499   switch (VT.getSimpleVT().SimpleTy) {
22500   case MVT::f32:
22501   case MVT::f64:
22502     return true;
22503   default:
22504     break;
22505   }
22506
22507   return false;
22508 }
22509
22510 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
22511   // i16 instructions are longer (0x66 prefix) and potentially slower.
22512   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
22513 }
22514
22515 /// Targets can use this to indicate that they only support *some*
22516 /// VECTOR_SHUFFLE operations, those with specific masks.
22517 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
22518 /// are assumed to be legal.
22519 bool
22520 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
22521                                       EVT VT) const {
22522   if (!VT.isSimple())
22523     return false;
22524
22525   // Not for i1 vectors
22526   if (VT.getSimpleVT().getScalarType() == MVT::i1)
22527     return false;
22528
22529   // Very little shuffling can be done for 64-bit vectors right now.
22530   if (VT.getSimpleVT().getSizeInBits() == 64)
22531     return false;
22532
22533   // We only care that the types being shuffled are legal. The lowering can
22534   // handle any possible shuffle mask that results.
22535   return isTypeLegal(VT.getSimpleVT());
22536 }
22537
22538 bool
22539 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
22540                                           EVT VT) const {
22541   // Just delegate to the generic legality, clear masks aren't special.
22542   return isShuffleMaskLegal(Mask, VT);
22543 }
22544
22545 //===----------------------------------------------------------------------===//
22546 //                           X86 Scheduler Hooks
22547 //===----------------------------------------------------------------------===//
22548
22549 /// Utility function to emit xbegin specifying the start of an RTM region.
22550 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
22551                                      const TargetInstrInfo *TII) {
22552   DebugLoc DL = MI.getDebugLoc();
22553
22554   const BasicBlock *BB = MBB->getBasicBlock();
22555   MachineFunction::iterator I = ++MBB->getIterator();
22556
22557   // For the v = xbegin(), we generate
22558   //
22559   // thisMBB:
22560   //  xbegin sinkMBB
22561   //
22562   // mainMBB:
22563   //  eax = -1
22564   //
22565   // sinkMBB:
22566   //  v = eax
22567
22568   MachineBasicBlock *thisMBB = MBB;
22569   MachineFunction *MF = MBB->getParent();
22570   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
22571   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
22572   MF->insert(I, mainMBB);
22573   MF->insert(I, sinkMBB);
22574
22575   // Transfer the remainder of BB and its successor edges to sinkMBB.
22576   sinkMBB->splice(sinkMBB->begin(), MBB,
22577                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
22578   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
22579
22580   // thisMBB:
22581   //  xbegin sinkMBB
22582   //  # fallthrough to mainMBB
22583   //  # abortion to sinkMBB
22584   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
22585   thisMBB->addSuccessor(mainMBB);
22586   thisMBB->addSuccessor(sinkMBB);
22587
22588   // mainMBB:
22589   //  EAX = -1
22590   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
22591   mainMBB->addSuccessor(sinkMBB);
22592
22593   // sinkMBB:
22594   // EAX is live into the sinkMBB
22595   sinkMBB->addLiveIn(X86::EAX);
22596   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
22597           MI.getOperand(0).getReg())
22598       .addReg(X86::EAX);
22599
22600   MI.eraseFromParent();
22601   return sinkMBB;
22602 }
22603
22604 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
22605 // or XMM0_V32I8 in AVX all of this code can be replaced with that
22606 // in the .td file.
22607 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
22608                                        const TargetInstrInfo *TII) {
22609   unsigned Opc;
22610   switch (MI.getOpcode()) {
22611   default: llvm_unreachable("illegal opcode!");
22612   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
22613   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
22614   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
22615   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
22616   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
22617   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
22618   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
22619   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
22620   }
22621
22622   DebugLoc dl = MI.getDebugLoc();
22623   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22624
22625   unsigned NumArgs = MI.getNumOperands();
22626   for (unsigned i = 1; i < NumArgs; ++i) {
22627     MachineOperand &Op = MI.getOperand(i);
22628     if (!(Op.isReg() && Op.isImplicit()))
22629       MIB.addOperand(Op);
22630   }
22631   if (MI.hasOneMemOperand())
22632     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22633
22634   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22635       .addReg(X86::XMM0);
22636
22637   MI.eraseFromParent();
22638   return BB;
22639 }
22640
22641 // FIXME: Custom handling because TableGen doesn't support multiple implicit
22642 // defs in an instruction pattern
22643 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
22644                                        const TargetInstrInfo *TII) {
22645   unsigned Opc;
22646   switch (MI.getOpcode()) {
22647   default: llvm_unreachable("illegal opcode!");
22648   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
22649   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
22650   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
22651   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
22652   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
22653   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
22654   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
22655   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
22656   }
22657
22658   DebugLoc dl = MI.getDebugLoc();
22659   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22660
22661   unsigned NumArgs = MI.getNumOperands(); // remove the results
22662   for (unsigned i = 1; i < NumArgs; ++i) {
22663     MachineOperand &Op = MI.getOperand(i);
22664     if (!(Op.isReg() && Op.isImplicit()))
22665       MIB.addOperand(Op);
22666   }
22667   if (MI.hasOneMemOperand())
22668     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22669
22670   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22671       .addReg(X86::ECX);
22672
22673   MI.eraseFromParent();
22674   return BB;
22675 }
22676
22677 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22678                                      const X86Subtarget &Subtarget) {
22679   DebugLoc dl = MI.getDebugLoc();
22680   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22681
22682   // insert input VAL into EAX
22683   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
22684       .addReg(MI.getOperand(0).getReg());
22685   // insert zero to ECX
22686   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22687
22688   // insert zero to EDX
22689   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
22690
22691   // insert WRPKRU instruction
22692   BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
22693
22694   MI.eraseFromParent(); // The pseudo is gone now.
22695   return BB;
22696 }
22697
22698 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22699                                      const X86Subtarget &Subtarget) {
22700   DebugLoc dl = MI.getDebugLoc();
22701   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22702
22703   // insert zero to ECX
22704   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22705
22706   // insert RDPKRU instruction
22707   BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
22708   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22709       .addReg(X86::EAX);
22710
22711   MI.eraseFromParent(); // The pseudo is gone now.
22712   return BB;
22713 }
22714
22715 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
22716                                       const X86Subtarget &Subtarget,
22717                                       unsigned Opc) {
22718   DebugLoc dl = MI.getDebugLoc();
22719   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22720   // Address into RAX/EAX, other two args into ECX, EDX.
22721   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
22722   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
22723   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
22724   for (int i = 0; i < X86::AddrNumOperands; ++i)
22725     MIB.addOperand(MI.getOperand(i));
22726
22727   unsigned ValOps = X86::AddrNumOperands;
22728   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
22729       .addReg(MI.getOperand(ValOps).getReg());
22730   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
22731       .addReg(MI.getOperand(ValOps + 1).getReg());
22732
22733   // The instruction doesn't actually take any operands though.
22734   BuildMI(*BB, MI, dl, TII->get(Opc));
22735
22736   MI.eraseFromParent(); // The pseudo is gone now.
22737   return BB;
22738 }
22739
// Custom inserter for the VAARG_64 pseudo (va_arg on x86-64).
//
// Replaces MI with code that computes the address of the next variadic
// argument into MI's result register and advances the va_list in memory:
//  - ArgMode 0: the address comes from overflow_area only; no blocks are
//    created.
//  - ArgMode 1 / 2: gp_offset / fp_offset is compared against a bound. The
//    address is either reg_save_area + offset (offsetMBB) or taken from
//    overflow_area (overflowMBB), and a PHI at the top of endMBB merges the
//    two into the result register.
// Returns endMBB, which is MBB itself when no branch is emitted and otherwise
// the new block that received everything that followed MI.
22740 MachineBasicBlock *
22741 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
22742                                                  MachineBasicBlock *MBB) const {
22743   // Emit va_arg instruction on X86-64.
22744
22745   // Operands to this pseudo-instruction:
22746   // 0  ) Output        : destination address (reg)
22747   // 1-5) Input         : va_list address (addr, i64mem)
22748   // 6  ) ArgSize       : Size (in bytes) of vararg type
22749   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
22750   // 8  ) Align         : Alignment of type
22751   // 9  ) EFLAGS (implicit-def)
22752
22753   assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
22754   static_assert(X86::AddrNumOperands == 5,
22755                 "VAARG_64 assumes 5 address operands");
22756
22757   unsigned DestReg = MI.getOperand(0).getReg();
22758   MachineOperand &Base = MI.getOperand(1);
22759   MachineOperand &Scale = MI.getOperand(2);
22760   MachineOperand &Index = MI.getOperand(3);
22761   MachineOperand &Disp = MI.getOperand(4);
22762   MachineOperand &Segment = MI.getOperand(5);
22763   unsigned ArgSize = MI.getOperand(6).getImm();
22764   unsigned ArgMode = MI.getOperand(7).getImm();
22765   unsigned Align = MI.getOperand(8).getImm();
22766
22767   // Memory Reference
        // The pseudo's single memory operand is attached to every va_list load
        // and store emitted below.
22768   assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
22769   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
22770   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
22771
22772   // Machine Information
22773   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22774   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
22775   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
22776   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
22777   DebugLoc DL = MI.getDebugLoc();
22778
22779   // struct va_list {
22780   //   i32   gp_offset
22781   //   i32   fp_offset
22782   //   i64   overflow_area (address)
22783   //   i64   reg_save_area (address)
22784   // }
22785   // sizeof(va_list) = 24
22786   // alignment(va_list) = 8
22787
        // Upper bound for the offset field being tested: 6 * 8 = 48 when using
        // gp_offset, and 48 + 8 * 16 = 176 when using fp_offset.
22788   unsigned TotalNumIntRegs = 6;
22789   unsigned TotalNumXMMRegs = 8;
22790   bool UseGPOffset = (ArgMode == 1);
22791   bool UseFPOffset = (ArgMode == 2);
22792   unsigned MaxOffset = TotalNumIntRegs * 8 +
22793                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
22794
22795   /* Align ArgSize to a multiple of 8 */
22796   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
        // Only alignments above 8 need the explicit round-up in the overflow
        // path below.
22797   bool NeedsAlign = (Align > 8);
22798
22799   MachineBasicBlock *thisMBB = MBB;
22800   MachineBasicBlock *overflowMBB;
22801   MachineBasicBlock *offsetMBB;
22802   MachineBasicBlock *endMBB;
22803
22804   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
22805   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
22806   unsigned OffsetReg = 0;
22807
22808   if (!UseGPOffset && !UseFPOffset) {
22809     // If we only pull from the overflow region, we don't create a branch.
22810     // We don't need to alter control flow.
22811     OffsetDestReg = 0; // unused
          // With no PHI, the overflow path writes the result register directly.
22812     OverflowDestReg = DestReg;
22813
22814     offsetMBB = nullptr;
22815     overflowMBB = thisMBB;
22816     endMBB = thisMBB;
22817   } else {
22818     // First emit code to check if gp_offset (or fp_offset) is below the bound.
22819     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
22820     // If not, pull from overflow_area. (branch to overflowMBB)
22821     //
22822     //       thisMBB
22823     //         |     .
22824     //         |        .
22825     //     offsetMBB   overflowMBB
22826     //         |        .
22827     //         |     .
22828     //        endMBB
22829
22830     // Registers for the PHI in endMBB
22831     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
22832     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
22833
22834     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
22835     MachineFunction *MF = MBB->getParent();
22836     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22837     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22838     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22839
22840     MachineFunction::iterator MBBIter = ++MBB->getIterator();
22841
22842     // Insert the new basic blocks
          // Layout order is offsetMBB, overflowMBB, endMBB, so thisMBB falls
          // through into offsetMBB and overflowMBB falls through into endMBB.
22843     MF->insert(MBBIter, offsetMBB);
22844     MF->insert(MBBIter, overflowMBB);
22845     MF->insert(MBBIter, endMBB);
22846
22847     // Transfer the remainder of MBB and its successor edges to endMBB.
22848     endMBB->splice(endMBB->begin(), thisMBB,
22849                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
22850     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
22851
22852     // Make offsetMBB and overflowMBB successors of thisMBB
22853     thisMBB->addSuccessor(offsetMBB);
22854     thisMBB->addSuccessor(overflowMBB);
22855
22856     // endMBB is a successor of both offsetMBB and overflowMBB
22857     offsetMBB->addSuccessor(endMBB);
22858     overflowMBB->addSuccessor(endMBB);
22859
22860     // Load the offset value into a register
          // gp_offset lives at displacement 0 of the va_list, fp_offset at 4.
22861     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
22862     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
22863       .addOperand(Base)
22864       .addOperand(Scale)
22865       .addOperand(Index)
22866       .addDisp(Disp, UseFPOffset ? 4 : 0)
22867       .addOperand(Segment)
22868       .setMemRefs(MMOBegin, MMOEnd);
22869
22870     // Check if there is enough room left to pull this argument.
          // For an 8-byte argument the immediate is exactly MaxOffset; a larger
          // (8-byte-rounded) argument lowers the bound by the extra bytes.
22871     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
22872       .addReg(OffsetReg)
22873       .addImm(MaxOffset + 8 - ArgSizeA8);
22874
22875     // Branch to "overflowMBB" if offset >= max
22876     // Fall through to "offsetMBB" otherwise
22877     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
22878       .addMBB(overflowMBB);
22879   }
22880
22881   // In offsetMBB, emit code to use the reg_save_area.
22882   if (offsetMBB) {
22883     assert(OffsetReg != 0);
22884
22885     // Read the reg_save_area address.
          // reg_save_area is the field at displacement 16 of the va_list.
22886     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
22887     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
22888       .addOperand(Base)
22889       .addOperand(Scale)
22890       .addOperand(Index)
22891       .addDisp(Disp, 16)
22892       .addOperand(Segment)
22893       .setMemRefs(MMOBegin, MMOEnd);
22894
22895     // Zero-extend the offset
22896     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
22897       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
22898         .addImm(0)
22899         .addReg(OffsetReg)
22900         .addImm(X86::sub_32bit);
22901
22902     // Add the offset to the reg_save_area to get the final address.
22903     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
22904       .addReg(OffsetReg64)
22905       .addReg(RegSaveReg);
22906
22907     // Compute the offset for the next argument
          // fp_offset advances by 16, gp_offset by 8.
22908     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
22909     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
22910       .addReg(OffsetReg)
22911       .addImm(UseFPOffset ? 16 : 8);
22912
22913     // Store it back into the va_list.
22914     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
22915       .addOperand(Base)
22916       .addOperand(Scale)
22917       .addOperand(Index)
22918       .addDisp(Disp, UseFPOffset ? 4 : 0)
22919       .addOperand(Segment)
22920       .addReg(NextOffsetReg)
22921       .setMemRefs(MMOBegin, MMOEnd);
22922
22923     // Jump to endMBB
          // overflowMBB was inserted between offsetMBB and endMBB, so an
          // explicit jump is emitted here.
22924     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
22925       .addMBB(endMBB);
22926   }
22927
22928   //
22929   // Emit code to use overflow area
22930   //
        // NOTE(review): the BuildMI(overflowMBB, ...) calls below take a block
        // pointer and no insertion point, which presumably appends to the end
        // of the block. In the no-branch case overflowMBB is thisMBB and
        // nothing was spliced away, so this code would land after whatever
        // follows MI. Confirm whether ArgMode 0 is ever generated.
22931
22932   // Load the overflow_area address into a register.
        // overflow_area is the field at displacement 8 of the va_list.
22933   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
22934   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
22935     .addOperand(Base)
22936     .addOperand(Scale)
22937     .addOperand(Index)
22938     .addDisp(Disp, 8)
22939     .addOperand(Segment)
22940     .setMemRefs(MMOBegin, MMOEnd);
22941
22942   // If we need to align it, do so. Otherwise, just copy the address
22943   // to OverflowDestReg.
22944   if (NeedsAlign) {
22945     // Align the overflow address
22946     assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
22947     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
22948
22949     // aligned_addr = (addr + (align-1)) & ~(align-1)
22950     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
22951       .addReg(OverflowAddrReg)
22952       .addImm(Align-1);
22953
22954     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
22955       .addReg(TmpReg)
22956       .addImm(~(uint64_t)(Align-1));
22957   } else {
22958     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
22959       .addReg(OverflowAddrReg);
22960   }
22961
22962   // Compute the next overflow address after this argument.
22963   // (the overflow address should be kept 8-byte aligned)
22964   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
22965   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
22966     .addReg(OverflowDestReg)
22967     .addImm(ArgSizeA8);
22968
22969   // Store the new overflow address.
22970   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
22971     .addOperand(Base)
22972     .addOperand(Scale)
22973     .addOperand(Index)
22974     .addDisp(Disp, 8)
22975     .addOperand(Segment)
22976     .addReg(NextAddrReg)
22977     .setMemRefs(MMOBegin, MMOEnd);
22978
22979   // If we branched, emit the PHI to the front of endMBB.
22980   if (offsetMBB) {
22981     BuildMI(*endMBB, endMBB->begin(), DL,
22982             TII->get(X86::PHI), DestReg)
22983       .addReg(OffsetDestReg).addMBB(offsetMBB)
22984       .addReg(OverflowDestReg).addMBB(overflowMBB);
22985   }
22986
22987   // Erase the pseudo instruction
22988   MI.eraseFromParent();
22989
22990   return endMBB;
22991 }
22992
22993 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
22994     MachineInstr &MI, MachineBasicBlock *MBB) const {
22995   // Emit code to save XMM registers to the stack. The ABI says that the
22996   // number of registers to save is given in %al, so it's theoretically
22997   // possible to do an indirect jump trick to avoid saving all of them,
22998   // however this code takes a simpler approach and just executes all
22999   // of the stores if %al is non-zero. It's less code, and it's probably
23000   // easier on the hardware branch predictor, and stores aren't all that
23001   // expensive anyway.
23002
23003   // Create the new basic blocks. One block contains all the XMM stores,
23004   // and one block is the final destination regardless of whether any
23005   // stores were performed.
23006   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
23007   MachineFunction *F = MBB->getParent();
23008   MachineFunction::iterator MBBIter = ++MBB->getIterator();
23009   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
23010   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
23011   F->insert(MBBIter, XMMSaveMBB);
23012   F->insert(MBBIter, EndMBB);
23013
23014   // Transfer the remainder of MBB and its successor edges to EndMBB.
23015   EndMBB->splice(EndMBB->begin(), MBB,
23016                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23017   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
23018
23019   // The original block will now fall through to the XMM save block.
23020   MBB->addSuccessor(XMMSaveMBB);
23021   // The XMMSaveMBB will fall through to the end block.
23022   XMMSaveMBB->addSuccessor(EndMBB);
23023
23024   // Now add the instructions.
23025   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23026   DebugLoc DL = MI.getDebugLoc();
23027
23028   unsigned CountReg = MI.getOperand(0).getReg();
23029   int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
23030   int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
23031
23032   if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
23033     // If %al is 0, branch around the XMM save block.
23034     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
23035     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
23036     MBB->addSuccessor(EndMBB);
23037   }
23038
23039   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
23040   // that was just emitted, but clearly shouldn't be "saved".
23041   assert((MI.getNumOperands() <= 3 ||
23042           !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
23043           MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
23044          "Expected last argument to be EFLAGS");
23045   unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
23046   // In the XMM save block, save all the XMM argument registers.
23047   for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
23048     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
23049     MachineMemOperand *MMO = F->getMachineMemOperand(
23050         MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
23051         MachineMemOperand::MOStore,
23052         /*Size=*/16, /*Align=*/16);
23053     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
23054         .addFrameIndex(RegSaveFrameIndex)
23055         .addImm(/*Scale=*/1)
23056         .addReg(/*IndexReg=*/0)
23057         .addImm(/*Disp=*/Offset)
23058         .addReg(/*Segment=*/0)
23059         .addReg(MI.getOperand(i).getReg())
23060         .addMemOperand(MMO);
23061   }
23062
23063   MI.eraseFromParent(); // The pseudo instruction is gone now.
23064
23065   return EndMBB;
23066 }
23067
23068 // The EFLAGS operand of SelectItr might be missing a kill marker
23069 // because there were multiple uses of EFLAGS, and ISel didn't know
23070 // which to mark. Figure out whether SelectItr should have had a
23071 // kill marker, and set it if it should. Returns the correct kill
23072 // marker value.
23073 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
23074                                      MachineBasicBlock* BB,
23075                                      const TargetRegisterInfo* TRI) {
23076   // Scan forward through BB for a use/def of EFLAGS.
23077   MachineBasicBlock::iterator miI(std::next(SelectItr));
23078   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
23079     const MachineInstr& mi = *miI;
23080     if (mi.readsRegister(X86::EFLAGS))
23081       return false;
23082     if (mi.definesRegister(X86::EFLAGS))
23083       break; // Should have kill-flag - update below.
23084   }
23085
23086   // If we hit the end of the block, check whether EFLAGS is live into a
23087   // successor.
23088   if (miI == BB->end()) {
23089     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
23090                                           sEnd = BB->succ_end();
23091          sItr != sEnd; ++sItr) {
23092       MachineBasicBlock* succ = *sItr;
23093       if (succ->isLiveIn(X86::EFLAGS))
23094         return false;
23095     }
23096   }
23097
23098   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
23099   // out. SelectMI should have a kill flag on EFLAGS.
23100   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
23101   return true;
23102 }
23103
23104 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
23105 // together with other CMOV pseudo-opcodes into a single basic-block with
23106 // conditional jump around it.
23107 static bool isCMOVPseudo(MachineInstr &MI) {
23108   switch (MI.getOpcode()) {
23109   case X86::CMOV_FR32:
23110   case X86::CMOV_FR64:
23111   case X86::CMOV_GR8:
23112   case X86::CMOV_GR16:
23113   case X86::CMOV_GR32:
23114   case X86::CMOV_RFP32:
23115   case X86::CMOV_RFP64:
23116   case X86::CMOV_RFP80:
23117   case X86::CMOV_V2F64:
23118   case X86::CMOV_V2I64:
23119   case X86::CMOV_V4F32:
23120   case X86::CMOV_V4F64:
23121   case X86::CMOV_V4I64:
23122   case X86::CMOV_V16F32:
23123   case X86::CMOV_V8F32:
23124   case X86::CMOV_V8F64:
23125   case X86::CMOV_V8I64:
23126   case X86::CMOV_V8I1:
23127   case X86::CMOV_V16I1:
23128   case X86::CMOV_V32I1:
23129   case X86::CMOV_V64I1:
23130     return true;
23131
23132   default:
23133     return false;
23134   }
23135 }
23136
// Custom inserter for the select (CMOV_*) pseudos.
//
// Replaces MI with a conditional branch around a new fall-through block
// (copy0MBB) plus PHI nodes at the head of a new join block (sinkMBB). Where
// possible the pseudo-CMOVs immediately following MI are lowered with it:
//  - a run of CMOV pseudos on the same or the opposite condition shares one
//    branch and gets one PHI each (case 1 below);
//  - a single cascaded CMOV that consumes MI's result gets a second branch
//    in its own block (jcc1MBB) and a third PHI input (case 2 below).
// Returns sinkMBB, which receives everything that followed the last CMOV
// lowered as part of case 1 (or MI itself).
23137 MachineBasicBlock *
23138 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
23139                                      MachineBasicBlock *BB) const {
23140   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23141   DebugLoc DL = MI.getDebugLoc();
23142
23143   // To "insert" a SELECT_CC instruction, we actually have to insert the
23144   // diamond control-flow pattern.  The incoming instruction knows the
23145   // destination vreg to set, the condition code register to branch on, the
23146   // true/false values to select between, and a branch opcode to use.
23147   const BasicBlock *LLVM_BB = BB->getBasicBlock();
23148   MachineFunction::iterator It = ++BB->getIterator();
23149
23150   //  thisMBB:
23151   //  ...
23152   //   TrueVal = ...
23153   //   cmpTY ccX, r1, r2
23154   //   bCC copy1MBB
23155   //   fallthrough --> copy0MBB
23156   MachineBasicBlock *thisMBB = BB;
23157   MachineFunction *F = BB->getParent();
23158
23159   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
23160   // as described above, by inserting a BB, and then making a PHI at the join
23161   // point to select the true and false operands of the CMOV in the PHI.
23162   //
23163   // The code also handles two different cases of multiple CMOV opcodes
23164   // in a row.
23165   //
23166   // Case 1:
23167   // In this case, there are multiple CMOVs in a row, all which are based on
23168   // the same condition setting (or the exact opposite condition setting).
23169   // In this case we can lower all the CMOVs using a single inserted BB, and
23170   // then make a number of PHIs at the join point to model the CMOVs. The only
23171   // trickiness here, is that in a case like:
23172   //
23173   // t2 = CMOV cond1 t1, f1
23174   // t3 = CMOV cond1 t2, f2
23175   //
23176   // when rewriting this into PHIs, we have to perform some renaming on the
23177   // temps since you cannot have a PHI operand refer to a PHI result earlier
23178   // in the same block.  The "simple" but wrong lowering would be:
23179   //
23180   // t2 = PHI t1(BB1), f1(BB2)
23181   // t3 = PHI t2(BB1), f2(BB2)
23182   //
23183   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
23184   // renaming is to note that on the path through BB1, t2 is really just a
23185   // copy of t1, and do that renaming, properly generating:
23186   //
23187   // t2 = PHI t1(BB1), f1(BB2)
23188   // t3 = PHI t1(BB1), f2(BB2)
23189   //
23190   // Case 2, we lower cascaded CMOVs such as
23191   //
23192   //   (CMOV (CMOV F, T, cc1), T, cc2)
23193   //
23194   // to two successive branches.  For that, we look for another CMOV as the
23195   // following instruction.
23196   //
23197   // Without this, we would add a PHI between the two jumps, which ends up
23198   // creating a few copies all around. For instance, for
23199   //
23200   //    (sitofp (zext (fcmp une)))
23201   //
23202   // we would generate:
23203   //
23204   //         ucomiss %xmm1, %xmm0
23205   //         movss  <1.0f>, %xmm0
23206   //         movaps  %xmm0, %xmm1
23207   //         jne     .LBB5_2
23208   //         xorps   %xmm1, %xmm1
23209   // .LBB5_2:
23210   //         jp      .LBB5_4
23211   //         movaps  %xmm1, %xmm0
23212   // .LBB5_4:
23213   //         retq
23214   //
23215   // because this custom-inserter would have generated:
23216   //
23217   //   A
23218   //   | \
23219   //   |  B
23220   //   | /
23221   //   C
23222   //   | \
23223   //   |  D
23224   //   | /
23225   //   E
23226   //
23227   // A: X = ...; Y = ...
23228   // B: empty
23229   // C: Z = PHI [X, A], [Y, B]
23230   // D: empty
23231   // E: PHI [X, C], [Z, D]
23232   //
23233   // If we lower both CMOVs in a single step, we can instead generate:
23234   //
23235   //   A
23236   //   | \
23237   //   |  C
23238   //   | /|
23239   //   |/ |
23240   //   |  |
23241   //   |  D
23242   //   | /
23243   //   E
23244   //
23245   // A: X = ...; Y = ...
23246   // D: empty
23247   // E: PHI [X, A], [X, C], [Y, D]
23248   //
23249   // Which, in our sitofp/fcmp example, gives us something like:
23250   //
23251   //         ucomiss %xmm1, %xmm0
23252   //         movss  <1.0f>, %xmm0
23253   //         jne     .LBB5_4
23254   //         jp      .LBB5_4
23255   //         xorps   %xmm0, %xmm0
23256   // .LBB5_4:
23257   //         retq
23258   //
23259   MachineInstr *CascadedCMOV = nullptr;
23260   MachineInstr *LastCMOV = &MI;
        // Operand 3 of the pseudo is its condition code; OppCC is the inverse
        // condition, which case 1 also accepts.
23261   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
23262   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
23263   MachineBasicBlock::iterator NextMIIt =
23264       std::next(MachineBasicBlock::iterator(MI));
23265
23266   // Check for case 1, where there are multiple CMOVs with the same condition
23267   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
23268   // number of jumps the most.
23269
23270   if (isCMOVPseudo(MI)) {
23271     // See if we have a string of CMOVS with the same condition.
23272     while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
23273            (NextMIIt->getOperand(3).getImm() == CC ||
23274             NextMIIt->getOperand(3).getImm() == OppCC)) {
23275       LastCMOV = &*NextMIIt;
23276       ++NextMIIt;
23277     }
23278   }
23279
23280   // This checks for case 2, but only do this if we didn't already find
23281   // case 1, as indicated by LastCMOV == MI.
        // The next instruction qualifies when it has MI's opcode, has the same
        // register as MI in operand 2, and reads MI's result as operand 1 with
        // a kill flag.
23282   if (LastCMOV == &MI && NextMIIt != BB->end() &&
23283       NextMIIt->getOpcode() == MI.getOpcode() &&
23284       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
23285       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
23286       NextMIIt->getOperand(1).isKill()) {
23287     CascadedCMOV = &*NextMIIt;
23288   }
23289
23290   MachineBasicBlock *jcc1MBB = nullptr;
23291
23292   // If we have a cascaded CMOV, we lower it to two successive branches to
23293   // the same block.  EFLAGS is used by both, so mark it as live in the second.
23294   if (CascadedCMOV) {
23295     jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
23296     F->insert(It, jcc1MBB);
23297     jcc1MBB->addLiveIn(X86::EFLAGS);
23298   }
23299
        // Layout after BB is [jcc1MBB,] copy0MBB, sinkMBB.
23300   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
23301   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
23302   F->insert(It, copy0MBB);
23303   F->insert(It, sinkMBB);
23304
23305   // If the EFLAGS register isn't dead in the terminator, then claim that it's
23306   // live into the sink and copy blocks.
23307   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
23308
        // checkAndUpdateEFLAGSKill is only consulted when the last CMOV carries
        // no kill flag for EFLAGS; it may add one as a side effect.
23309   MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
23310   if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
23311       !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
23312     copy0MBB->addLiveIn(X86::EFLAGS);
23313     sinkMBB->addLiveIn(X86::EFLAGS);
23314   }
23315
23316   // Transfer the remainder of BB and its successor edges to sinkMBB.
        // In the cascaded case LastCMOV is &MI, so CascadedCMOV (the
        // instruction right after MI) moves into sinkMBB here and is erased
        // from there further down.
23317   sinkMBB->splice(sinkMBB->begin(), BB,
23318                   std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
23319   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
23320
23321   // Add the true and fallthrough blocks as its successors.
23322   if (CascadedCMOV) {
23323     // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
23324     BB->addSuccessor(jcc1MBB);
23325
23326     // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
23327     // jump to the sinkMBB.
23328     jcc1MBB->addSuccessor(copy0MBB);
23329     jcc1MBB->addSuccessor(sinkMBB);
23330   } else {
23331     BB->addSuccessor(copy0MBB);
23332   }
23333
23334   // The true block target of the first (or only) branch is always sinkMBB.
23335   BB->addSuccessor(sinkMBB);
23336
23337   // Create the conditional branch instruction.
23338   unsigned Opc = X86::GetCondBranchFromCond(CC);
23339   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
23340
23341   if (CascadedCMOV) {
23342     unsigned Opc2 = X86::GetCondBranchFromCond(
23343         (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
23344     BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
23345   }
23346
23347   //  copy0MBB:
23348   //   %FalseValue = ...
23349   //   # fallthrough to sinkMBB
23350   copy0MBB->addSuccessor(sinkMBB);
23351
23352   //  sinkMBB:
23353   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
23354   //  ...
        // [MIItBegin, MIItEnd) is the run of CMOV pseudos still sitting in BB.
23355   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
23356   MachineBasicBlock::iterator MIItEnd =
23357     std::next(MachineBasicBlock::iterator(LastCMOV));
23358   MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
        // Maps each PHI result emitted so far to its (copy0MBB input, thisMBB
        // input) register pair.
23359   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
23360   MachineInstrBuilder MIB;
23361
23362   // As we are creating the PHIs, we have to be careful if there is more than
23363   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
23364   // PHIs have to reference the individual true/false inputs from earlier PHIs.
23365   // That also means that PHI construction must work forward from earlier to
23366   // later, and that the code must maintain a mapping from earlier PHI's
23367   // destination registers, and the registers that went into the PHI.
23368
23369   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
23370     unsigned DestReg = MIIt->getOperand(0).getReg();
23371     unsigned Op1Reg = MIIt->getOperand(1).getReg();
23372     unsigned Op2Reg = MIIt->getOperand(2).getReg();
23373
23374     // If this CMOV we are generating is the opposite condition from
23375     // the jump we generated, then we have to swap the operands for the
23376     // PHI that is going to be generated.
23377     if (MIIt->getOperand(3).getImm() == OppCC)
23378         std::swap(Op1Reg, Op2Reg);
23379
          // An input that is itself an earlier PHI result from this loop is
          // replaced by that PHI's input for the same incoming edge.
23380     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
23381       Op1Reg = RegRewriteTable[Op1Reg].first;
23382
23383     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
23384       Op2Reg = RegRewriteTable[Op2Reg].second;
23385
          // Op1Reg arrives via copy0MBB (branch not taken), Op2Reg via thisMBB
          // (branch taken). Each PHI is inserted before SinkInsertionPoint, so
          // the PHIs end up in program order.
23386     MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
23387                   TII->get(X86::PHI), DestReg)
23388           .addReg(Op1Reg).addMBB(copy0MBB)
23389           .addReg(Op2Reg).addMBB(thisMBB);
23390
23391     // Add this PHI to the rewrite table.
23392     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
23393   }
23394
23395   // If we have a cascaded CMOV, the second Jcc provides the same incoming
23396   // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
23397   if (CascadedCMOV) {
          // CascadedCMOV is only set when LastCMOV == &MI, so the loop above
          // ran exactly once and MIB refers to the single PHI built for MI.
23398     MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
23399     // Copy the PHI result to the register defined by the second CMOV.
23400     BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
23401             DL, TII->get(TargetOpcode::COPY),
23402             CascadedCMOV->getOperand(0).getReg())
23403         .addReg(MI.getOperand(0).getReg());
23404     CascadedCMOV->eraseFromParent();
23405   }
23406
23407   // Now remove the CMOV(s).
        // The iterator is advanced before the erase so it never points at a
        // removed instruction.
23408   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
23409     (MIIt++)->eraseFromParent();
23410
23411   return sinkMBB;
23412 }
23413
23414 MachineBasicBlock *
23415 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
23416                                        MachineBasicBlock *BB) const {
23417   // Combine the following atomic floating-point modification pattern:
23418   //   a.store(reg OP a.load(acquire), release)
23419   // Transform them into:
23420   //   OPss (%gpr), %xmm
23421   //   movss %xmm, (%gpr)
23422   // Or sd equivalent for 64-bit operations.
23423   unsigned MOp, FOp;
23424   switch (MI.getOpcode()) {
23425   default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
23426   case X86::RELEASE_FADD32mr:
23427     FOp = X86::ADDSSrm;
23428     MOp = X86::MOVSSmr;
23429     break;
23430   case X86::RELEASE_FADD64mr:
23431     FOp = X86::ADDSDrm;
23432     MOp = X86::MOVSDmr;
23433     break;
23434   }
23435   const X86InstrInfo *TII = Subtarget.getInstrInfo();
23436   DebugLoc DL = MI.getDebugLoc();
23437   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
23438   unsigned ValOpIdx = X86::AddrNumOperands;
23439   unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
23440   MachineInstrBuilder MIB =
23441       BuildMI(*BB, MI, DL, TII->get(FOp),
23442               MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
23443           .addReg(VSrc);
23444   for (int i = 0; i < X86::AddrNumOperands; ++i) {
23445     MachineOperand &Operand = MI.getOperand(i);
23446     // Clear any kill flags on register operands as we'll create a second
23447     // instruction using the same address operands.
23448     if (Operand.isReg())
23449       Operand.setIsKill(false);
23450     MIB.addOperand(Operand);
23451   }
23452   MachineInstr *FOpMI = MIB;
23453   MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
23454   for (int i = 0; i < X86::AddrNumOperands; ++i)
23455     MIB.addOperand(MI.getOperand(i));
23456   MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
23457   MI.eraseFromParent(); // The pseudo instruction is gone now.
23458   return BB;
23459 }
23460
23461 MachineBasicBlock *
23462 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
23463                                         MachineBasicBlock *BB) const {
23464   MachineFunction *MF = BB->getParent();
23465   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23466   DebugLoc DL = MI.getDebugLoc();
23467   const BasicBlock *LLVM_BB = BB->getBasicBlock();
23468
23469   assert(MF->shouldSplitStack());
23470
23471   const bool Is64Bit = Subtarget.is64Bit();
23472   const bool IsLP64 = Subtarget.isTarget64BitLP64();
23473
23474   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
23475   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
23476
23477   // BB:
23478   //  ... [Till the alloca]
23479   // If stacklet is not large enough, jump to mallocMBB
23480   //
23481   // bumpMBB:
23482   //  Allocate by subtracting from RSP
23483   //  Jump to continueMBB
23484   //
23485   // mallocMBB:
23486   //  Allocate by call to runtime
23487   //
23488   // continueMBB:
23489   //  ...
23490   //  [rest of original BB]
23491   //
23492
23493   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23494   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23495   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23496
23497   MachineRegisterInfo &MRI = MF->getRegInfo();
23498   const TargetRegisterClass *AddrRegClass =
23499       getRegClassFor(getPointerTy(MF->getDataLayout()));
23500
23501   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23502            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23503            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
23504            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
23505            sizeVReg = MI.getOperand(1).getReg(),
23506            physSPReg =
23507                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
23508
23509   MachineFunction::iterator MBBIter = ++BB->getIterator();
23510
23511   MF->insert(MBBIter, bumpMBB);
23512   MF->insert(MBBIter, mallocMBB);
23513   MF->insert(MBBIter, continueMBB);
23514
23515   continueMBB->splice(continueMBB->begin(), BB,
23516                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
23517   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
23518
23519   // Add code to the main basic block to check if the stack limit has been hit,
23520   // and if so, jump to mallocMBB otherwise to bumpMBB.
23521   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
23522   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
23523     .addReg(tmpSPVReg).addReg(sizeVReg);
23524   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
23525     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
23526     .addReg(SPLimitVReg);
23527   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
23528
23529   // bumpMBB simply decreases the stack pointer, since we know the current
23530   // stacklet has enough space.
23531   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
23532     .addReg(SPLimitVReg);
23533   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
23534     .addReg(SPLimitVReg);
23535   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23536
23537   // Calls into a routine in libgcc to allocate more space from the heap.
23538   const uint32_t *RegMask =
23539       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
23540   if (IsLP64) {
23541     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
23542       .addReg(sizeVReg);
23543     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23544       .addExternalSymbol("__morestack_allocate_stack_space")
23545       .addRegMask(RegMask)
23546       .addReg(X86::RDI, RegState::Implicit)
23547       .addReg(X86::RAX, RegState::ImplicitDefine);
23548   } else if (Is64Bit) {
23549     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
23550       .addReg(sizeVReg);
23551     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23552       .addExternalSymbol("__morestack_allocate_stack_space")
23553       .addRegMask(RegMask)
23554       .addReg(X86::EDI, RegState::Implicit)
23555       .addReg(X86::EAX, RegState::ImplicitDefine);
23556   } else {
23557     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
23558       .addImm(12);
23559     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
23560     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
23561       .addExternalSymbol("__morestack_allocate_stack_space")
23562       .addRegMask(RegMask)
23563       .addReg(X86::EAX, RegState::ImplicitDefine);
23564   }
23565
23566   if (!Is64Bit)
23567     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
23568       .addImm(16);
23569
23570   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
23571     .addReg(IsLP64 ? X86::RAX : X86::EAX);
23572   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23573
23574   // Set up the CFG correctly.
23575   BB->addSuccessor(bumpMBB);
23576   BB->addSuccessor(mallocMBB);
23577   mallocMBB->addSuccessor(continueMBB);
23578   bumpMBB->addSuccessor(continueMBB);
23579
23580   // Take care of the PHI nodes.
23581   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
23582           MI.getOperand(0).getReg())
23583       .addReg(mallocPtrVReg)
23584       .addMBB(mallocMBB)
23585       .addReg(bumpSPPtrVReg)
23586       .addMBB(bumpMBB);
23587
23588   // Delete the original pseudo instruction.
23589   MI.eraseFromParent();
23590
23591   // And we're done.
23592   return continueMBB;
23593 }
23594
23595 MachineBasicBlock *
23596 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
23597                                        MachineBasicBlock *BB) const {
23598   MachineFunction *MF = BB->getParent();
23599   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23600   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
23601   DebugLoc DL = MI.getDebugLoc();
23602
23603   assert(!isAsynchronousEHPersonality(
23604              classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
23605          "SEH does not use catchret!");
23606
23607   // Only 32-bit EH needs to worry about manually restoring stack pointers.
23608   if (!Subtarget.is32Bit())
23609     return BB;
23610
23611   // C++ EH creates a new target block to hold the restore code, and wires up
23612   // the new block to the return destination with a normal JMP_4.
23613   MachineBasicBlock *RestoreMBB =
23614       MF->CreateMachineBasicBlock(BB->getBasicBlock());
23615   assert(BB->succ_size() == 1);
23616   MF->insert(std::next(BB->getIterator()), RestoreMBB);
23617   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
23618   BB->addSuccessor(RestoreMBB);
23619   MI.getOperand(0).setMBB(RestoreMBB);
23620
23621   auto RestoreMBBI = RestoreMBB->begin();
23622   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
23623   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
23624   return BB;
23625 }
23626
23627 MachineBasicBlock *
23628 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
23629                                        MachineBasicBlock *BB) const {
23630   MachineFunction *MF = BB->getParent();
23631   const Constant *PerFn = MF->getFunction()->getPersonalityFn();
23632   bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
23633   // Only 32-bit SEH requires special handling for catchpad.
23634   if (IsSEH && Subtarget.is32Bit()) {
23635     const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23636     DebugLoc DL = MI.getDebugLoc();
23637     BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
23638   }
23639   MI.eraseFromParent();
23640   return BB;
23641 }
23642
23643 MachineBasicBlock *
23644 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
23645                                       MachineBasicBlock *BB) const {
23646   // So, here we replace TLSADDR with the sequence:
23647   // adjust_stackdown -> TLSADDR -> adjust_stackup.
23648   // We need this because TLSADDR is lowered into calls
23649   // inside MC, therefore without the two markers shrink-wrapping
23650   // may push the prologue/epilogue pass them.
23651   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23652   DebugLoc DL = MI.getDebugLoc();
23653   MachineFunction &MF = *BB->getParent();
23654
23655   // Emit CALLSEQ_START right before the instruction.
23656   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
23657   MachineInstrBuilder CallseqStart =
23658     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
23659   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
23660
23661   // Emit CALLSEQ_END right after the instruction.
23662   // We don't call erase from parent because we want to keep the
23663   // original instruction around.
23664   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
23665   MachineInstrBuilder CallseqEnd =
23666     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
23667   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
23668
23669   return BB;
23670 }
23671
23672 MachineBasicBlock *
23673 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
23674                                       MachineBasicBlock *BB) const {
23675   // This is pretty easy.  We're taking the value that we received from
23676   // our load from the relocation, sticking it in either RDI (x86-64)
23677   // or EAX and doing an indirect call.  The return value will then
23678   // be in the normal return register.
23679   MachineFunction *F = BB->getParent();
23680   const X86InstrInfo *TII = Subtarget.getInstrInfo();
23681   DebugLoc DL = MI.getDebugLoc();
23682
23683   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
23684   assert(MI.getOperand(3).isGlobal() && "This should be a global");
23685
23686   // Get a register mask for the lowered call.
23687   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
23688   // proper register mask.
23689   const uint32_t *RegMask =
23690       Subtarget.is64Bit() ?
23691       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
23692       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
23693   if (Subtarget.is64Bit()) {
23694     MachineInstrBuilder MIB =
23695         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
23696             .addReg(X86::RIP)
23697             .addImm(0)
23698             .addReg(0)
23699             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23700                               MI.getOperand(3).getTargetFlags())
23701             .addReg(0);
23702     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
23703     addDirectMem(MIB, X86::RDI);
23704     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
23705   } else if (!isPositionIndependent()) {
23706     MachineInstrBuilder MIB =
23707         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
23708             .addReg(0)
23709             .addImm(0)
23710             .addReg(0)
23711             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23712                               MI.getOperand(3).getTargetFlags())
23713             .addReg(0);
23714     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23715     addDirectMem(MIB, X86::EAX);
23716     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
23717   } else {
23718     MachineInstrBuilder MIB =
23719         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
23720             .addReg(TII->getGlobalBaseReg(F))
23721             .addImm(0)
23722             .addReg(0)
23723             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23724                               MI.getOperand(3).getTargetFlags())
23725             .addReg(0);
23726     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23727     addDirectMem(MIB, X86::EAX);
23728     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
23729   }
23730
23731   MI.eraseFromParent(); // The pseudo instruction is gone now.
23732   return BB;
23733 }
23734
23735 MachineBasicBlock *
23736 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
23737                                     MachineBasicBlock *MBB) const {
23738   DebugLoc DL = MI.getDebugLoc();
23739   MachineFunction *MF = MBB->getParent();
23740   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23741   MachineRegisterInfo &MRI = MF->getRegInfo();
23742
23743   const BasicBlock *BB = MBB->getBasicBlock();
23744   MachineFunction::iterator I = ++MBB->getIterator();
23745
23746   // Memory Reference
23747   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23748   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23749
23750   unsigned DstReg;
23751   unsigned MemOpndSlot = 0;
23752
23753   unsigned CurOp = 0;
23754
23755   DstReg = MI.getOperand(CurOp++).getReg();
23756   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
23757   assert(RC->hasType(MVT::i32) && "Invalid destination!");
23758   unsigned mainDstReg = MRI.createVirtualRegister(RC);
23759   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
23760
23761   MemOpndSlot = CurOp;
23762
23763   MVT PVT = getPointerTy(MF->getDataLayout());
23764   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23765          "Invalid Pointer Size!");
23766
23767   // For v = setjmp(buf), we generate
23768   //
23769   // thisMBB:
23770   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
23771   //  SjLjSetup restoreMBB
23772   //
23773   // mainMBB:
23774   //  v_main = 0
23775   //
23776   // sinkMBB:
23777   //  v = phi(main, restore)
23778   //
23779   // restoreMBB:
23780   //  if base pointer being used, load it from frame
23781   //  v_restore = 1
23782
23783   MachineBasicBlock *thisMBB = MBB;
23784   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
23785   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
23786   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
23787   MF->insert(I, mainMBB);
23788   MF->insert(I, sinkMBB);
23789   MF->push_back(restoreMBB);
23790   restoreMBB->setHasAddressTaken();
23791
23792   MachineInstrBuilder MIB;
23793
23794   // Transfer the remainder of BB and its successor edges to sinkMBB.
23795   sinkMBB->splice(sinkMBB->begin(), MBB,
23796                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23797   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
23798
23799   // thisMBB:
23800   unsigned PtrStoreOpc = 0;
23801   unsigned LabelReg = 0;
23802   const int64_t LabelOffset = 1 * PVT.getStoreSize();
23803   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23804                      !isPositionIndependent();
23805
23806   // Prepare IP either in reg or imm.
23807   if (!UseImmLabel) {
23808     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23809     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
23810     LabelReg = MRI.createVirtualRegister(PtrRC);
23811     if (Subtarget.is64Bit()) {
23812       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
23813               .addReg(X86::RIP)
23814               .addImm(0)
23815               .addReg(0)
23816               .addMBB(restoreMBB)
23817               .addReg(0);
23818     } else {
23819       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
23820       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
23821               .addReg(XII->getGlobalBaseReg(MF))
23822               .addImm(0)
23823               .addReg(0)
23824               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
23825               .addReg(0);
23826     }
23827   } else
23828     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23829   // Store IP
23830   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
23831   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23832     if (i == X86::AddrDisp)
23833       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
23834     else
23835       MIB.addOperand(MI.getOperand(MemOpndSlot + i));
23836   }
23837   if (!UseImmLabel)
23838     MIB.addReg(LabelReg);
23839   else
23840     MIB.addMBB(restoreMBB);
23841   MIB.setMemRefs(MMOBegin, MMOEnd);
23842   // Setup
23843   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
23844           .addMBB(restoreMBB);
23845
23846   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23847   MIB.addRegMask(RegInfo->getNoPreservedMask());
23848   thisMBB->addSuccessor(mainMBB);
23849   thisMBB->addSuccessor(restoreMBB);
23850
23851   // mainMBB:
23852   //  EAX = 0
23853   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
23854   mainMBB->addSuccessor(sinkMBB);
23855
23856   // sinkMBB:
23857   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
23858           TII->get(X86::PHI), DstReg)
23859     .addReg(mainDstReg).addMBB(mainMBB)
23860     .addReg(restoreDstReg).addMBB(restoreMBB);
23861
23862   // restoreMBB:
23863   if (RegInfo->hasBasePointer(*MF)) {
23864     const bool Uses64BitFramePtr =
23865         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
23866     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
23867     X86FI->setRestoreBasePointer(MF);
23868     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
23869     unsigned BasePtr = RegInfo->getBaseRegister();
23870     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
23871     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
23872                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
23873       .setMIFlag(MachineInstr::FrameSetup);
23874   }
23875   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
23876   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
23877   restoreMBB->addSuccessor(sinkMBB);
23878
23879   MI.eraseFromParent();
23880   return sinkMBB;
23881 }
23882
23883 MachineBasicBlock *
23884 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
23885                                      MachineBasicBlock *MBB) const {
23886   DebugLoc DL = MI.getDebugLoc();
23887   MachineFunction *MF = MBB->getParent();
23888   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23889   MachineRegisterInfo &MRI = MF->getRegInfo();
23890
23891   // Memory Reference
23892   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23893   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23894
23895   MVT PVT = getPointerTy(MF->getDataLayout());
23896   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23897          "Invalid Pointer Size!");
23898
23899   const TargetRegisterClass *RC =
23900     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23901   unsigned Tmp = MRI.createVirtualRegister(RC);
23902   // Since FP is only updated here but NOT referenced, it's treated as GPR.
23903   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23904   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
23905   unsigned SP = RegInfo->getStackRegister();
23906
23907   MachineInstrBuilder MIB;
23908
23909   const int64_t LabelOffset = 1 * PVT.getStoreSize();
23910   const int64_t SPOffset = 2 * PVT.getStoreSize();
23911
23912   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
23913   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
23914
23915   // Reload FP
23916   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
23917   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
23918     MIB.addOperand(MI.getOperand(i));
23919   MIB.setMemRefs(MMOBegin, MMOEnd);
23920   // Reload IP
23921   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
23922   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23923     if (i == X86::AddrDisp)
23924       MIB.addDisp(MI.getOperand(i), LabelOffset);
23925     else
23926       MIB.addOperand(MI.getOperand(i));
23927   }
23928   MIB.setMemRefs(MMOBegin, MMOEnd);
23929   // Reload SP
23930   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
23931   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23932     if (i == X86::AddrDisp)
23933       MIB.addDisp(MI.getOperand(i), SPOffset);
23934     else
23935       MIB.addOperand(MI.getOperand(i));
23936   }
23937   MIB.setMemRefs(MMOBegin, MMOEnd);
23938   // Jump
23939   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
23940
23941   MI.eraseFromParent();
23942   return MBB;
23943 }
23944
23945 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
23946                                                MachineBasicBlock *MBB,
23947                                                MachineBasicBlock *DispatchBB,
23948                                                int FI) const {
23949   DebugLoc DL = MI.getDebugLoc();
23950   MachineFunction *MF = MBB->getParent();
23951   MachineRegisterInfo *MRI = &MF->getRegInfo();
23952   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23953
23954   MVT PVT = getPointerTy(MF->getDataLayout());
23955   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
23956
23957   unsigned Op = 0;
23958   unsigned VR = 0;
23959
23960   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23961                      !isPositionIndependent();
23962
23963   if (UseImmLabel) {
23964     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23965   } else {
23966     const TargetRegisterClass *TRC =
23967         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23968     VR = MRI->createVirtualRegister(TRC);
23969     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23970
23971     /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
23972
23973     if (Subtarget.is64Bit())
23974       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
23975           .addReg(X86::RIP)
23976           .addImm(1)
23977           .addReg(0)
23978           .addMBB(DispatchBB)
23979           .addReg(0);
23980     else
23981       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
23982           .addReg(0) /* XII->getGlobalBaseReg(MF) */
23983           .addImm(1)
23984           .addReg(0)
23985           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
23986           .addReg(0);
23987   }
23988
23989   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
23990   addFrameReference(MIB, FI, 36);
23991   if (UseImmLabel)
23992     MIB.addMBB(DispatchBB);
23993   else
23994     MIB.addReg(VR);
23995 }
23996
23997 MachineBasicBlock *
23998 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
23999                                          MachineBasicBlock *BB) const {
24000   DebugLoc DL = MI.getDebugLoc();
24001   MachineFunction *MF = BB->getParent();
24002   MachineModuleInfo *MMI = &MF->getMMI();
24003   MachineFrameInfo *MFI = MF->getFrameInfo();
24004   MachineRegisterInfo *MRI = &MF->getRegInfo();
24005   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24006   int FI = MFI->getFunctionContextIndex();
24007
24008   // Get a mapping of the call site numbers to all of the landing pads they're
24009   // associated with.
24010   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
24011   unsigned MaxCSNum = 0;
24012   for (auto &MBB : *MF) {
24013     if (!MBB.isEHPad())
24014       continue;
24015
24016     MCSymbol *Sym = nullptr;
24017     for (const auto &MI : MBB) {
24018       if (MI.isDebugValue())
24019         continue;
24020
24021       assert(MI.isEHLabel() && "expected EH_LABEL");
24022       Sym = MI.getOperand(0).getMCSymbol();
24023       break;
24024     }
24025
24026     if (!MMI->hasCallSiteLandingPad(Sym))
24027       continue;
24028
24029     for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
24030       CallSiteNumToLPad[CSI].push_back(&MBB);
24031       MaxCSNum = std::max(MaxCSNum, CSI);
24032     }
24033   }
24034
24035   // Get an ordered list of the machine basic blocks for the jump table.
24036   std::vector<MachineBasicBlock *> LPadList;
24037   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
24038   LPadList.reserve(CallSiteNumToLPad.size());
24039
24040   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
24041     for (auto &LP : CallSiteNumToLPad[CSI]) {
24042       LPadList.push_back(LP);
24043       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
24044     }
24045   }
24046
24047   assert(!LPadList.empty() &&
24048          "No landing pad destinations for the dispatch jump table!");
24049
24050   // Create the MBBs for the dispatch code.
24051
24052   // Shove the dispatch's address into the return slot in the function context.
24053   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
24054   DispatchBB->setIsEHPad(true);
24055
24056   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
24057   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
24058   DispatchBB->addSuccessor(TrapBB);
24059
24060   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
24061   DispatchBB->addSuccessor(DispContBB);
24062
24063   // Insert MBBs.
24064   MF->push_back(DispatchBB);
24065   MF->push_back(DispContBB);
24066   MF->push_back(TrapBB);
24067
24068   // Insert code into the entry block that creates and registers the function
24069   // context.
24070   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
24071
24072   // Create the jump table and associated information
24073   MachineJumpTableInfo *JTI =
24074       MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
24075   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
24076
24077   const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
24078   const X86RegisterInfo &RI = XII->getRegisterInfo();
24079
24080   // Add a register mask with no preserved registers.  This results in all
24081   // registers being marked as clobbered.
24082   if (RI.hasBasePointer(*MF)) {
24083     const bool FPIs64Bit =
24084         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
24085     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
24086     MFI->setRestoreBasePointer(MF);
24087
24088     unsigned FP = RI.getFrameRegister(*MF);
24089     unsigned BP = RI.getBaseRegister();
24090     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
24091     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
24092                  MFI->getRestoreBasePointerOffset())
24093         .addRegMask(RI.getNoPreservedMask());
24094   } else {
24095     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
24096         .addRegMask(RI.getNoPreservedMask());
24097   }
24098
24099   unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
24100   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
24101                     4);
24102   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
24103       .addReg(IReg)
24104       .addImm(LPadList.size());
24105   BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
24106
24107   unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
24108   BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
24109       .addReg(IReg)
24110       .addImm(1);
24111   BuildMI(DispContBB, DL,
24112           TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
24113       .addReg(0)
24114       .addImm(Subtarget.is64Bit() ? 8 : 4)
24115       .addReg(JReg)
24116       .addJumpTableIndex(MJTI)
24117       .addReg(0);
24118
24119   // Add the jump table entries as successors to the MBB.
24120   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
24121   for (auto &LP : LPadList)
24122     if (SeenMBBs.insert(LP).second)
24123       DispContBB->addSuccessor(LP);
24124
24125   // N.B. the order the invoke BBs are processed in doesn't matter here.
24126   SmallVector<MachineBasicBlock *, 64> MBBLPads;
24127   const MCPhysReg *SavedRegs =
24128       Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
24129   for (MachineBasicBlock *MBB : InvokeBBs) {
24130     // Remove the landing pad successor from the invoke block and replace it
24131     // with the new dispatch block.
24132     // Keep a copy of Successors since it's modified inside the loop.
24133     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
24134                                                    MBB->succ_rend());
24135     // FIXME: Avoid quadratic complexity.
24136     for (auto MBBS : Successors) {
24137       if (MBBS->isEHPad()) {
24138         MBB->removeSuccessor(MBBS);
24139         MBBLPads.push_back(MBBS);
24140       }
24141     }
24142
24143     MBB->addSuccessor(DispatchBB);
24144
24145     // Find the invoke call and mark all of the callee-saved registers as
24146     // 'implicit defined' so that they're spilled.  This prevents code from
24147     // moving instructions to before the EH block, where they will never be
24148     // executed.
24149     for (auto &II : reverse(*MBB)) {
24150       if (!II.isCall())
24151         continue;
24152
24153       DenseMap<unsigned, bool> DefRegs;
24154       for (auto &MOp : II.operands())
24155         if (MOp.isReg())
24156           DefRegs[MOp.getReg()] = true;
24157
24158       MachineInstrBuilder MIB(*MF, &II);
24159       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
24160         unsigned Reg = SavedRegs[RI];
24161         if (!DefRegs[Reg])
24162           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
24163       }
24164
24165       break;
24166     }
24167   }
24168
24169   // Mark all former landing pads as non-landing pads.  The dispatch is the only
24170   // landing pad now.
24171   for (auto &LP : MBBLPads)
24172     LP->setIsEHPad(false);
24173
24174   // The instruction is gone now.
24175   MI.eraseFromParent();
24176   return BB;
24177 }
24178
24179 // Replace 213-type (isel default) FMA3 instructions with 231-type for
24180 // accumulator loops. Writing back to the accumulator allows the coalescer
24181 // to remove extra copies in the loop.
24182 // FIXME: Do this on AVX512.  We don't support 231 variants yet (PR23937).
24183 MachineBasicBlock *
24184 X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
24185                                  MachineBasicBlock *MBB) const {
24186   MachineOperand &AddendOp = MI.getOperand(3);
24187
24188   // Bail out early if the addend isn't a register - we can't switch these.
24189   if (!AddendOp.isReg())
24190     return MBB;
24191
24192   MachineFunction &MF = *MBB->getParent();
24193   MachineRegisterInfo &MRI = MF.getRegInfo();
24194
24195   // Check whether the addend is defined by a PHI:
24196   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
24197   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
24198   if (!AddendDef.isPHI())
24199     return MBB;
24200
24201   // Look for the following pattern:
24202   // loop:
24203   //   %addend = phi [%entry, 0], [%loop, %result]
24204   //   ...
24205   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
24206
24207   // Replace with:
24208   //   loop:
24209   //   %addend = phi [%entry, 0], [%loop, %result]
24210   //   ...
24211   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
24212
24213   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
24214     assert(AddendDef.getOperand(i).isReg());
24215     MachineOperand PHISrcOp = AddendDef.getOperand(i);
24216     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
24217     if (&PHISrcInst == &MI) {
24218       // Found a matching instruction.
24219       unsigned NewFMAOpc = 0;
24220       switch (MI.getOpcode()) {
24221       case X86::VFMADDPDr213r:
24222         NewFMAOpc = X86::VFMADDPDr231r;
24223         break;
24224       case X86::VFMADDPSr213r:
24225         NewFMAOpc = X86::VFMADDPSr231r;
24226         break;
24227       case X86::VFMADDSDr213r:
24228         NewFMAOpc = X86::VFMADDSDr231r;
24229         break;
24230       case X86::VFMADDSSr213r:
24231         NewFMAOpc = X86::VFMADDSSr231r;
24232         break;
24233       case X86::VFMSUBPDr213r:
24234         NewFMAOpc = X86::VFMSUBPDr231r;
24235         break;
24236       case X86::VFMSUBPSr213r:
24237         NewFMAOpc = X86::VFMSUBPSr231r;
24238         break;
24239       case X86::VFMSUBSDr213r:
24240         NewFMAOpc = X86::VFMSUBSDr231r;
24241         break;
24242       case X86::VFMSUBSSr213r:
24243         NewFMAOpc = X86::VFMSUBSSr231r;
24244         break;
24245       case X86::VFNMADDPDr213r:
24246         NewFMAOpc = X86::VFNMADDPDr231r;
24247         break;
24248       case X86::VFNMADDPSr213r:
24249         NewFMAOpc = X86::VFNMADDPSr231r;
24250         break;
24251       case X86::VFNMADDSDr213r:
24252         NewFMAOpc = X86::VFNMADDSDr231r;
24253         break;
24254       case X86::VFNMADDSSr213r:
24255         NewFMAOpc = X86::VFNMADDSSr231r;
24256         break;
24257       case X86::VFNMSUBPDr213r:
24258         NewFMAOpc = X86::VFNMSUBPDr231r;
24259         break;
24260       case X86::VFNMSUBPSr213r:
24261         NewFMAOpc = X86::VFNMSUBPSr231r;
24262         break;
24263       case X86::VFNMSUBSDr213r:
24264         NewFMAOpc = X86::VFNMSUBSDr231r;
24265         break;
24266       case X86::VFNMSUBSSr213r:
24267         NewFMAOpc = X86::VFNMSUBSSr231r;
24268         break;
24269       case X86::VFMADDSUBPDr213r:
24270         NewFMAOpc = X86::VFMADDSUBPDr231r;
24271         break;
24272       case X86::VFMADDSUBPSr213r:
24273         NewFMAOpc = X86::VFMADDSUBPSr231r;
24274         break;
24275       case X86::VFMSUBADDPDr213r:
24276         NewFMAOpc = X86::VFMSUBADDPDr231r;
24277         break;
24278       case X86::VFMSUBADDPSr213r:
24279         NewFMAOpc = X86::VFMSUBADDPSr231r;
24280         break;
24281
24282       case X86::VFMADDPDr213rY:
24283         NewFMAOpc = X86::VFMADDPDr231rY;
24284         break;
24285       case X86::VFMADDPSr213rY:
24286         NewFMAOpc = X86::VFMADDPSr231rY;
24287         break;
24288       case X86::VFMSUBPDr213rY:
24289         NewFMAOpc = X86::VFMSUBPDr231rY;
24290         break;
24291       case X86::VFMSUBPSr213rY:
24292         NewFMAOpc = X86::VFMSUBPSr231rY;
24293         break;
24294       case X86::VFNMADDPDr213rY:
24295         NewFMAOpc = X86::VFNMADDPDr231rY;
24296         break;
24297       case X86::VFNMADDPSr213rY:
24298         NewFMAOpc = X86::VFNMADDPSr231rY;
24299         break;
24300       case X86::VFNMSUBPDr213rY:
24301         NewFMAOpc = X86::VFNMSUBPDr231rY;
24302         break;
24303       case X86::VFNMSUBPSr213rY:
24304         NewFMAOpc = X86::VFNMSUBPSr231rY;
24305         break;
24306       case X86::VFMADDSUBPDr213rY:
24307         NewFMAOpc = X86::VFMADDSUBPDr231rY;
24308         break;
24309       case X86::VFMADDSUBPSr213rY:
24310         NewFMAOpc = X86::VFMADDSUBPSr231rY;
24311         break;
24312       case X86::VFMSUBADDPDr213rY:
24313         NewFMAOpc = X86::VFMSUBADDPDr231rY;
24314         break;
24315       case X86::VFMSUBADDPSr213rY:
24316         NewFMAOpc = X86::VFMSUBADDPSr231rY;
24317         break;
24318       default:
24319         llvm_unreachable("Unrecognized FMA variant.");
24320       }
24321
24322       const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
24323       MachineInstrBuilder MIB =
24324           BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
24325               .addOperand(MI.getOperand(0))
24326               .addOperand(MI.getOperand(3))
24327               .addOperand(MI.getOperand(2))
24328               .addOperand(MI.getOperand(1));
24329       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
24330       MI.eraseFromParent();
24331     }
24332   }
24333
24334   return MBB;
24335 }
24336
24337 MachineBasicBlock *
24338 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
24339                                                MachineBasicBlock *BB) const {
24340   switch (MI.getOpcode()) {
24341   default: llvm_unreachable("Unexpected instr type to insert");
24342   case X86::TAILJMPd64:
24343   case X86::TAILJMPr64:
24344   case X86::TAILJMPm64:
24345   case X86::TAILJMPd64_REX:
24346   case X86::TAILJMPr64_REX:
24347   case X86::TAILJMPm64_REX:
24348     llvm_unreachable("TAILJMP64 would not be touched here.");
24349   case X86::TCRETURNdi64:
24350   case X86::TCRETURNri64:
24351   case X86::TCRETURNmi64:
24352     return BB;
24353   case X86::TLS_addr32:
24354   case X86::TLS_addr64:
24355   case X86::TLS_base_addr32:
24356   case X86::TLS_base_addr64:
24357     return EmitLoweredTLSAddr(MI, BB);
24358   case X86::CATCHRET:
24359     return EmitLoweredCatchRet(MI, BB);
24360   case X86::CATCHPAD:
24361     return EmitLoweredCatchPad(MI, BB);
24362   case X86::SEG_ALLOCA_32:
24363   case X86::SEG_ALLOCA_64:
24364     return EmitLoweredSegAlloca(MI, BB);
24365   case X86::TLSCall_32:
24366   case X86::TLSCall_64:
24367     return EmitLoweredTLSCall(MI, BB);
24368   case X86::CMOV_FR32:
24369   case X86::CMOV_FR64:
24370   case X86::CMOV_FR128:
24371   case X86::CMOV_GR8:
24372   case X86::CMOV_GR16:
24373   case X86::CMOV_GR32:
24374   case X86::CMOV_RFP32:
24375   case X86::CMOV_RFP64:
24376   case X86::CMOV_RFP80:
24377   case X86::CMOV_V2F64:
24378   case X86::CMOV_V2I64:
24379   case X86::CMOV_V4F32:
24380   case X86::CMOV_V4F64:
24381   case X86::CMOV_V4I64:
24382   case X86::CMOV_V16F32:
24383   case X86::CMOV_V8F32:
24384   case X86::CMOV_V8F64:
24385   case X86::CMOV_V8I64:
24386   case X86::CMOV_V8I1:
24387   case X86::CMOV_V16I1:
24388   case X86::CMOV_V32I1:
24389   case X86::CMOV_V64I1:
24390     return EmitLoweredSelect(MI, BB);
24391
24392   case X86::RDFLAGS32:
24393   case X86::RDFLAGS64: {
24394     DebugLoc DL = MI.getDebugLoc();
24395     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24396     unsigned PushF =
24397         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
24398     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
24399     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
24400     // Permit reads of the FLAGS register without it being defined.
24401     // This intrinsic exists to read external processor state in flags, such as
24402     // the trap flag, interrupt flag, and direction flag, none of which are
24403     // modeled by the backend.
24404     Push->getOperand(2).setIsUndef();
24405     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
24406
24407     MI.eraseFromParent(); // The pseudo is gone now.
24408     return BB;
24409   }
24410
24411   case X86::WRFLAGS32:
24412   case X86::WRFLAGS64: {
24413     DebugLoc DL = MI.getDebugLoc();
24414     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24415     unsigned Push =
24416         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
24417     unsigned PopF =
24418         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
24419     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
24420     BuildMI(*BB, MI, DL, TII->get(PopF));
24421
24422     MI.eraseFromParent(); // The pseudo is gone now.
24423     return BB;
24424   }
24425
24426   case X86::RELEASE_FADD32mr:
24427   case X86::RELEASE_FADD64mr:
24428     return EmitLoweredAtomicFP(MI, BB);
24429
24430   case X86::FP32_TO_INT16_IN_MEM:
24431   case X86::FP32_TO_INT32_IN_MEM:
24432   case X86::FP32_TO_INT64_IN_MEM:
24433   case X86::FP64_TO_INT16_IN_MEM:
24434   case X86::FP64_TO_INT32_IN_MEM:
24435   case X86::FP64_TO_INT64_IN_MEM:
24436   case X86::FP80_TO_INT16_IN_MEM:
24437   case X86::FP80_TO_INT32_IN_MEM:
24438   case X86::FP80_TO_INT64_IN_MEM: {
24439     MachineFunction *F = BB->getParent();
24440     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24441     DebugLoc DL = MI.getDebugLoc();
24442
24443     // Change the floating point control register to use "round towards zero"
24444     // mode when truncating to an integer value.
24445     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
24446     addFrameReference(BuildMI(*BB, MI, DL,
24447                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
24448
24449     // Load the old value of the high byte of the control word...
24450     unsigned OldCW =
24451       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
24452     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
24453                       CWFrameIdx);
24454
24455     // Set the high part to be round to zero...
24456     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
24457       .addImm(0xC7F);
24458
24459     // Reload the modified control word now...
24460     addFrameReference(BuildMI(*BB, MI, DL,
24461                               TII->get(X86::FLDCW16m)), CWFrameIdx);
24462
24463     // Restore the memory image of control word to original value
24464     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
24465       .addReg(OldCW);
24466
24467     // Get the X86 opcode to use.
24468     unsigned Opc;
24469     switch (MI.getOpcode()) {
24470     default: llvm_unreachable("illegal opcode!");
24471     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
24472     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
24473     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
24474     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
24475     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
24476     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
24477     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
24478     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
24479     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
24480     }
24481
24482     X86AddressMode AM = getAddressFromInstr(&MI, 0);
24483     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
24484         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
24485
24486     // Reload the original control word now.
24487     addFrameReference(BuildMI(*BB, MI, DL,
24488                               TII->get(X86::FLDCW16m)), CWFrameIdx);
24489
24490     MI.eraseFromParent(); // The pseudo instruction is gone now.
24491     return BB;
24492   }
24493     // String/text processing lowering.
24494   case X86::PCMPISTRM128REG:
24495   case X86::VPCMPISTRM128REG:
24496   case X86::PCMPISTRM128MEM:
24497   case X86::VPCMPISTRM128MEM:
24498   case X86::PCMPESTRM128REG:
24499   case X86::VPCMPESTRM128REG:
24500   case X86::PCMPESTRM128MEM:
24501   case X86::VPCMPESTRM128MEM:
24502     assert(Subtarget.hasSSE42() &&
24503            "Target must have SSE4.2 or AVX features enabled");
24504     return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
24505
24506   // String/text processing lowering.
24507   case X86::PCMPISTRIREG:
24508   case X86::VPCMPISTRIREG:
24509   case X86::PCMPISTRIMEM:
24510   case X86::VPCMPISTRIMEM:
24511   case X86::PCMPESTRIREG:
24512   case X86::VPCMPESTRIREG:
24513   case X86::PCMPESTRIMEM:
24514   case X86::VPCMPESTRIMEM:
24515     assert(Subtarget.hasSSE42() &&
24516            "Target must have SSE4.2 or AVX features enabled");
24517     return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
24518
24519   // Thread synchronization.
24520   case X86::MONITOR:
24521     return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
24522   case X86::MONITORX:
24523     return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
24524   // PKU feature
24525   case X86::WRPKRU:
24526     return emitWRPKRU(MI, BB, Subtarget);
24527   case X86::RDPKRU:
24528     return emitRDPKRU(MI, BB, Subtarget);
24529   // xbegin
24530   case X86::XBEGIN:
24531     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
24532
24533   case X86::VASTART_SAVE_XMM_REGS:
24534     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
24535
24536   case X86::VAARG_64:
24537     return EmitVAARG64WithCustomInserter(MI, BB);
24538
24539   case X86::EH_SjLj_SetJmp32:
24540   case X86::EH_SjLj_SetJmp64:
24541     return emitEHSjLjSetJmp(MI, BB);
24542
24543   case X86::EH_SjLj_LongJmp32:
24544   case X86::EH_SjLj_LongJmp64:
24545     return emitEHSjLjLongJmp(MI, BB);
24546
24547   case X86::Int_eh_sjlj_setup_dispatch:
24548     return EmitSjLjDispatchBlock(MI, BB);
24549
24550   case TargetOpcode::STATEPOINT:
24551     // As an implementation detail, STATEPOINT shares the STACKMAP format at
24552     // this point in the process.  We diverge later.
24553     return emitPatchPoint(MI, BB);
24554
24555   case TargetOpcode::STACKMAP:
24556   case TargetOpcode::PATCHPOINT:
24557     return emitPatchPoint(MI, BB);
24558
24559   case X86::VFMADDPDr213r:
24560   case X86::VFMADDPSr213r:
24561   case X86::VFMADDSDr213r:
24562   case X86::VFMADDSSr213r:
24563   case X86::VFMSUBPDr213r:
24564   case X86::VFMSUBPSr213r:
24565   case X86::VFMSUBSDr213r:
24566   case X86::VFMSUBSSr213r:
24567   case X86::VFNMADDPDr213r:
24568   case X86::VFNMADDPSr213r:
24569   case X86::VFNMADDSDr213r:
24570   case X86::VFNMADDSSr213r:
24571   case X86::VFNMSUBPDr213r:
24572   case X86::VFNMSUBPSr213r:
24573   case X86::VFNMSUBSDr213r:
24574   case X86::VFNMSUBSSr213r:
24575   case X86::VFMADDSUBPDr213r:
24576   case X86::VFMADDSUBPSr213r:
24577   case X86::VFMSUBADDPDr213r:
24578   case X86::VFMSUBADDPSr213r:
24579   case X86::VFMADDPDr213rY:
24580   case X86::VFMADDPSr213rY:
24581   case X86::VFMSUBPDr213rY:
24582   case X86::VFMSUBPSr213rY:
24583   case X86::VFNMADDPDr213rY:
24584   case X86::VFNMADDPSr213rY:
24585   case X86::VFNMSUBPDr213rY:
24586   case X86::VFNMSUBPSr213rY:
24587   case X86::VFMADDSUBPDr213rY:
24588   case X86::VFMADDSUBPSr213rY:
24589   case X86::VFMSUBADDPDr213rY:
24590   case X86::VFMSUBADDPSr213rY:
24591     return emitFMA3Instr(MI, BB);
24592   case X86::LCMPXCHG8B_SAVE_EBX:
24593   case X86::LCMPXCHG16B_SAVE_RBX: {
24594     unsigned BasePtr =
24595         MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
24596     if (!BB->isLiveIn(BasePtr))
24597       BB->addLiveIn(BasePtr);
24598     return BB;
24599   }
24600   }
24601 }
24602
24603 //===----------------------------------------------------------------------===//
24604 //                           X86 Optimization Hooks
24605 //===----------------------------------------------------------------------===//
24606
24607 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
24608                                                       APInt &KnownZero,
24609                                                       APInt &KnownOne,
24610                                                       const SelectionDAG &DAG,
24611                                                       unsigned Depth) const {
24612   unsigned BitWidth = KnownZero.getBitWidth();
24613   unsigned Opc = Op.getOpcode();
24614   assert((Opc >= ISD::BUILTIN_OP_END ||
24615           Opc == ISD::INTRINSIC_WO_CHAIN ||
24616           Opc == ISD::INTRINSIC_W_CHAIN ||
24617           Opc == ISD::INTRINSIC_VOID) &&
24618          "Should use MaskedValueIsZero if you don't know whether Op"
24619          " is a target node!");
24620
24621   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
24622   switch (Opc) {
24623   default: break;
24624   case X86ISD::ADD:
24625   case X86ISD::SUB:
24626   case X86ISD::ADC:
24627   case X86ISD::SBB:
24628   case X86ISD::SMUL:
24629   case X86ISD::UMUL:
24630   case X86ISD::INC:
24631   case X86ISD::DEC:
24632   case X86ISD::OR:
24633   case X86ISD::XOR:
24634   case X86ISD::AND:
24635     // These nodes' second result is a boolean.
24636     if (Op.getResNo() == 0)
24637       break;
24638     // Fallthrough
24639   case X86ISD::SETCC:
24640     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
24641     break;
24642   case X86ISD::MOVMSK: {
24643     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
24644     KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
24645     break;
24646   }
24647   }
24648 }
24649
24650 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
24651   SDValue Op,
24652   const SelectionDAG &,
24653   unsigned Depth) const {
24654   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
24655   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
24656     return Op.getValueType().getScalarSizeInBits();
24657
24658   // Fallback case.
24659   return 1;
24660 }
24661
24662 /// Returns true (and the GlobalValue and the offset) if the node is a
24663 /// GlobalAddress + offset.
24664 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
24665                                        const GlobalValue* &GA,
24666                                        int64_t &Offset) const {
24667   if (N->getOpcode() == X86ISD::Wrapper) {
24668     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
24669       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
24670       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
24671       return true;
24672     }
24673   }
24674   return TargetLowering::isGAPlusOffset(N, GA, Offset);
24675 }
24676
24677 /// Performs shuffle combines for 256-bit vectors.
24678 /// FIXME: This could be expanded to support 512 bit vectors as well.
24679 static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
24680                                  TargetLowering::DAGCombinerInfo &DCI,
24681                                  const X86Subtarget &Subtarget) {
24682   SDLoc dl(N);
24683   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
24684   SDValue V1 = SVOp->getOperand(0);
24685   SDValue V2 = SVOp->getOperand(1);
24686   MVT VT = SVOp->getSimpleValueType(0);
24687   unsigned NumElems = VT.getVectorNumElements();
24688
24689   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
24690       V2.getOpcode() == ISD::CONCAT_VECTORS) {
24691     //
24692     //                   0,0,0,...
24693     //                      |
24694     //    V      UNDEF    BUILD_VECTOR    UNDEF
24695     //     \      /           \           /
24696     //  CONCAT_VECTOR         CONCAT_VECTOR
24697     //         \                  /
24698     //          \                /
24699     //          RESULT: V + zero extended
24700     //
24701     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
24702         !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
24703       return SDValue();
24704
24705     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
24706       return SDValue();
24707
24708     // To match the shuffle mask, the first half of the mask should
24709     // be exactly the first vector, and all the rest a splat with the
24710     // first element of the second one.
24711     for (unsigned i = 0; i != NumElems/2; ++i)
24712       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
24713           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
24714         return SDValue();
24715
24716     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
24717     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
24718       if (Ld->hasNUsesOfValue(1, 0)) {
24719         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
24720         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
24721         SDValue ResNode =
24722           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
24723                                   Ld->getMemoryVT(),
24724                                   Ld->getPointerInfo(),
24725                                   Ld->getAlignment(),
24726                                   false/*isVolatile*/, true/*ReadMem*/,
24727                                   false/*WriteMem*/);
24728
24729         // Make sure the newly-created LOAD is in the same position as Ld in
24730         // terms of dependency. We create a TokenFactor for Ld and ResNode,
24731         // and update uses of Ld's output chain to use the TokenFactor.
24732         if (Ld->hasAnyUseOfValue(1)) {
24733           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24734                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
24735           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
24736           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
24737                                  SDValue(ResNode.getNode(), 1));
24738         }
24739
24740         return DAG.getBitcast(VT, ResNode);
24741       }
24742     }
24743
24744     // Emit a zeroed vector and insert the desired subvector on its
24745     // first half.
24746     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
24747     SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
24748     return DCI.CombineTo(N, InsV);
24749   }
24750
24751   return SDValue();
24752 }
24753
24754 // Attempt to match a combined shuffle mask against supported unary shuffle
24755 // instructions.
24756 // TODO: Investigate sharing more of this with shuffle lowering.
24757 static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24758                                     const X86Subtarget &Subtarget,
24759                                     unsigned &Shuffle, MVT &ShuffleVT) {
24760   bool FloatDomain = SrcVT.isFloatingPoint() ||
24761                      (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
24762
24763   // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
24764   if (!FloatDomain && SrcVT.is128BitVector() &&
24765       isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
24766     Shuffle = X86ISD::VZEXT_MOVL;
24767     ShuffleVT = MVT::v2i64;
24768     return true;
24769   }
24770
24771   // Check if we have SSE3 which will let us use MOVDDUP etc. The
24772   // instructions are no slower than UNPCKLPD but has the option to
24773   // fold the input operand into even an unaligned memory load.
24774   if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
24775     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
24776       Shuffle = X86ISD::MOVDDUP;
24777       ShuffleVT = MVT::v2f64;
24778       return true;
24779     }
24780     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24781       Shuffle = X86ISD::MOVSLDUP;
24782       ShuffleVT = MVT::v4f32;
24783       return true;
24784     }
24785     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
24786       Shuffle = X86ISD::MOVSHDUP;
24787       ShuffleVT = MVT::v4f32;
24788       return true;
24789     }
24790   }
24791
24792   if (SrcVT.is256BitVector() && FloatDomain) {
24793     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
24794     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24795       Shuffle = X86ISD::MOVDDUP;
24796       ShuffleVT = MVT::v4f64;
24797       return true;
24798     }
24799     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24800       Shuffle = X86ISD::MOVSLDUP;
24801       ShuffleVT = MVT::v8f32;
24802       return true;
24803     }
24804     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
24805       Shuffle = X86ISD::MOVSHDUP;
24806       ShuffleVT = MVT::v8f32;
24807       return true;
24808     }
24809   }
24810
24811   if (SrcVT.is512BitVector() && FloatDomain) {
24812     assert(Subtarget.hasAVX512() &&
24813            "AVX512 required for 512-bit vector shuffles");
24814     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24815       Shuffle = X86ISD::MOVDDUP;
24816       ShuffleVT = MVT::v8f64;
24817       return true;
24818     }
24819     if (isTargetShuffleEquivalent(
24820             Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
24821       Shuffle = X86ISD::MOVSLDUP;
24822       ShuffleVT = MVT::v16f32;
24823       return true;
24824     }
24825     if (isTargetShuffleEquivalent(
24826             Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
24827       Shuffle = X86ISD::MOVSHDUP;
24828       ShuffleVT = MVT::v16f32;
24829       return true;
24830     }
24831   }
24832
24833   // Attempt to match against broadcast-from-vector.
24834   if (Subtarget.hasAVX2()) {
24835     unsigned NumElts = Mask.size();
24836     SmallVector<int, 64> BroadcastMask(NumElts, 0);
24837     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
24838       unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
24839       ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
24840                               : MVT::getIntegerVT(EltSize);
24841       ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
24842       Shuffle = X86ISD::VBROADCAST;
24843       return true;
24844     }
24845   }
24846
24847   return false;
24848 }
24849
24850 // Attempt to match a combined shuffle mask against supported unary immediate
24851 // permute instructions.
24852 // TODO: Investigate sharing more of this with shuffle lowering.
24853 static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24854                                       const X86Subtarget &Subtarget,
24855                                       unsigned &Shuffle, MVT &ShuffleVT,
24856                                       unsigned &PermuteImm) {
24857   // Ensure we don't contain any zero elements.
24858   for (int M : Mask) {
24859     if (M == SM_SentinelZero)
24860       return false;
24861     assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
24862            "Expected unary shuffle");
24863   }
24864
24865   unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
24866   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
24867
24868   // Handle PSHUFLW/PSHUFHW repeated patterns.
24869   if (MaskScalarSizeInBits == 16) {
24870     SmallVector<int, 4> RepeatedMask;
24871     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
24872       ArrayRef<int> LoMask(Mask.data() + 0, 4);
24873       ArrayRef<int> HiMask(Mask.data() + 4, 4);
24874
24875       // PSHUFLW: permute lower 4 elements only.
24876       if (isUndefOrInRange(LoMask, 0, 4) &&
24877           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
24878         Shuffle = X86ISD::PSHUFLW;
24879         ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24880         PermuteImm = getV4X86ShuffleImm(LoMask);
24881         return true;
24882       }
24883
24884       // PSHUFHW: permute upper 4 elements only.
24885       if (isUndefOrInRange(HiMask, 4, 8) &&
24886           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
24887         // Offset the HiMask so that we can create the shuffle immediate.
24888         int OffsetHiMask[4];
24889         for (int i = 0; i != 4; ++i)
24890           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
24891
24892         Shuffle = X86ISD::PSHUFHW;
24893         ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24894         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
24895         return true;
24896       }
24897
24898       return false;
24899     }
24900     return false;
24901   }
24902
24903   // We only support permutation of 32/64 bit elements after this.
24904   if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
24905     return false;
24906
24907   // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
24908   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
24909   bool FloatDomain = SrcVT.isFloatingPoint();
24910   if (FloatDomain && !Subtarget.hasAVX())
24911     return false;
24912
24913   // Pre-AVX2 we must use float shuffles on 256-bit vectors.
24914   if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
24915     FloatDomain = true;
24916
24917   // Check for lane crossing permutes.
24918   if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
24919     // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
24920     if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
24921       Shuffle = X86ISD::VPERMI;
24922       ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
24923       PermuteImm = getV4X86ShuffleImm(Mask);
24924       return true;
24925     }
24926     if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
24927       SmallVector<int, 4> RepeatedMask;
24928       if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
24929         Shuffle = X86ISD::VPERMI;
24930         ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
24931         PermuteImm = getV4X86ShuffleImm(RepeatedMask);
24932         return true;
24933       }
24934     }
24935     return false;
24936   }
24937
24938   // VPERMILPD can permute with a non-repeating shuffle.
24939   if (FloatDomain && MaskScalarSizeInBits == 64) {
24940     Shuffle = X86ISD::VPERMILPI;
24941     ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
24942     PermuteImm = 0;
24943     for (int i = 0, e = Mask.size(); i != e; ++i) {
24944       int M = Mask[i];
24945       if (M == SM_SentinelUndef)
24946         continue;
24947       assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
24948       PermuteImm |= (M & 1) << i;
24949     }
24950     return true;
24951   }
24952
24953   // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
24954   SmallVector<int, 4> RepeatedMask;
24955   if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
24956     return false;
24957
24958   // Narrow the repeated mask for 32-bit element permutes.
24959   SmallVector<int, 4> WordMask = RepeatedMask;
24960   if (MaskScalarSizeInBits == 64)
24961     scaleShuffleMask(2, RepeatedMask, WordMask);
24962
24963   Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
24964   ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
24965   ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
24966   PermuteImm = getV4X86ShuffleImm(WordMask);
24967   return true;
24968 }
24969
24970 // Attempt to match a combined unary shuffle mask against supported binary
24971 // shuffle instructions.
24972 // TODO: Investigate sharing more of this with shuffle lowering.
24973 static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24974                                      unsigned &Shuffle, MVT &ShuffleVT) {
24975   bool FloatDomain = SrcVT.isFloatingPoint();
24976
24977   if (SrcVT.is128BitVector()) {
24978     if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
24979       Shuffle = X86ISD::MOVLHPS;
24980       ShuffleVT = MVT::v4f32;
24981       return true;
24982     }
24983     if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
24984       Shuffle = X86ISD::MOVHLPS;
24985       ShuffleVT = MVT::v4f32;
24986       return true;
24987     }
24988     if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
24989       Shuffle = X86ISD::UNPCKL;
24990       ShuffleVT = MVT::v4f32;
24991       return true;
24992     }
24993     if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
24994       Shuffle = X86ISD::UNPCKH;
24995       ShuffleVT = MVT::v4f32;
24996       return true;
24997     }
24998     if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
24999         isTargetShuffleEquivalent(
25000             Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
25001       Shuffle = X86ISD::UNPCKL;
25002       ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
25003       return true;
25004     }
25005     if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
25006         isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
25007                                          13, 14, 14, 15, 15})) {
25008       Shuffle = X86ISD::UNPCKH;
25009       ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
25010       return true;
25011     }
25012   }
25013
25014   return false;
25015 }
25016
25017 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
25018 /// possible.
25019 ///
25020 /// This is the leaf of the recursive combine below. When we have found some
25021 /// chain of single-use x86 shuffle instructions and accumulated the combined
25022 /// shuffle mask represented by them, this will try to pattern match that mask
25023 /// into either a single instruction if there is a special purpose instruction
25024 /// for this operation, or into a PSHUFB instruction which is a fully general
25025 /// instruction but should only be used to replace chains over a certain depth.
      ///
      /// \param Input operand entering the shuffle chain; bitcasts are peeked
      ///        through and it may have other users since it is never removed.
      /// \param Root node that is replaced (via DCI.CombineTo) on success.
      /// \param BaseMask accumulated shuffle mask (must not be empty); negative
      ///        entries are the undef/zero sentinels.
      /// \param Depth length of the chain being replaced. With Depth == 1 a match
      ///        that would only re-create Root's own opcode is rejected, and the
      ///        variable mask combines at the end are never attempted.
      /// \param HasVariableMask whether the chain involves a variable mask
      ///        shuffle; required for the VPERMILPV combine and allows the PSHUFB
      ///        combine before depth 3.
      /// \returns true if Root was replaced, false if no combine was performed.
25026 static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
25027                                    ArrayRef<int> BaseMask, int Depth,
25028                                    bool HasVariableMask, SelectionDAG &DAG,
25029                                    TargetLowering::DAGCombinerInfo &DCI,
25030                                    const X86Subtarget &Subtarget) {
25031   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
25032
25033   // Find the operand that enters the chain. Note that multiple uses are OK
25034   // here, we're not going to remove the operand we find.
25035   Input = peekThroughBitcasts(Input);
25036
25037   MVT VT = Input.getSimpleValueType();
25038   MVT RootVT = Root.getSimpleValueType();
25039   SDLoc DL(Root);
25040
25041   SDValue Res;
25042
        // A single element mask can only be the identity (asserted below), so
        // Root is simply a bitcast of Input.
25043   unsigned NumBaseMaskElts = BaseMask.size();
25044   if (NumBaseMaskElts == 1) {
25045     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
25046     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
25047                   /*AddTo*/ true);
25048     return true;
25049   }
25050
25051   unsigned RootSizeInBits = RootVT.getSizeInBits();
25052   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
25053
25054   // Don't combine if we are an AVX512/EVEX target and the mask element size
25055   // is different from the root element size - this would prevent writemasks
25056   // from being reused.
25057   // TODO - this currently prevents all lane shuffles from occurring.
25058   // TODO - check for writemasks usage instead of always preventing combining.
25059   // TODO - attempt to narrow Mask back to writemask size.
25060   if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
25061       (RootSizeInBits == 512 ||
25062        (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
25063     return false;
25064   }
25065
25066   // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
25067
25068   // Handle 128-bit lane shuffles of 256-bit vectors.
25069   if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
25070       !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
25071     if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
25072       return false; // Nothing to do!
25073     MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
25074                                                                   : MVT::v4i64);
          // One immediate nibble per result half: bit 0 picks the source half,
          // sentinel (negative) entries get 0x8.
          // NOTE(review): 0x8 is presumably the "zero this half" bit of
          // VPERM2X128 - confirm against the ISA reference.
25075     unsigned PermMask = 0;
25076     PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
25077     PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
25078
25079     Res = DAG.getBitcast(ShuffleVT, Input);
25080     DCI.AddToWorklist(Res.getNode());
25081     Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
25082                       DAG.getUNDEF(ShuffleVT),
25083                       DAG.getConstant(PermMask, DL, MVT::i8));
25084     DCI.AddToWorklist(Res.getNode());
25085     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25086                   /*AddTo*/ true);
25087     return true;
25088   }
25089
25090   // For masks that have been widened to 128-bit elements or more,
25091   // narrow back down to 64-bit elements.
25092   SmallVector<int, 64> Mask;
25093   if (BaseMaskEltSizeInBits > 64) {
25094     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
25095     int MaskScale = BaseMaskEltSizeInBits / 64;
25096     scaleShuffleMask(MaskScale, BaseMask, Mask);
25097   } else {
25098     Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
25099   }
25100
25101   unsigned NumMaskElts = Mask.size();
25102   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
25103
25104   // Determine the effective mask value type.
25105   bool FloatDomain =
25106       (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
25107       (32 <= MaskEltSizeInBits);
25108   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
25109                            : MVT::getIntegerVT(MaskEltSizeInBits);
25110   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
25111
25112   // Attempt to match the mask against known shuffle patterns.
25113   MVT ShuffleVT;
25114   unsigned Shuffle, PermuteImm;
25115
        // Single operand shuffles without an immediate.
25116   if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
25117     if (Depth == 1 && Root.getOpcode() == Shuffle)
25118       return false; // Nothing to do!
25119     Res = DAG.getBitcast(ShuffleVT, Input);
25120     DCI.AddToWorklist(Res.getNode());
25121     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
25122     DCI.AddToWorklist(Res.getNode());
25123     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25124                   /*AddTo*/ true);
25125     return true;
25126   }
25127
        // Single operand permutes controlled by an 8-bit immediate.
25128   if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
25129                                 PermuteImm)) {
25130     if (Depth == 1 && Root.getOpcode() == Shuffle)
25131       return false; // Nothing to do!
25132     Res = DAG.getBitcast(ShuffleVT, Input);
25133     DCI.AddToWorklist(Res.getNode());
25134     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
25135                       DAG.getConstant(PermuteImm, DL, MVT::i8));
25136     DCI.AddToWorklist(Res.getNode());
25137     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25138                   /*AddTo*/ true);
25139     return true;
25140   }
25141
        // Two operand shuffles; the single input is passed as both operands.
25142   if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
25143     if (Depth == 1 && Root.getOpcode() == Shuffle)
25144       return false; // Nothing to do!
25145     Res = DAG.getBitcast(ShuffleVT, Input);
25146     DCI.AddToWorklist(Res.getNode());
25147     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
25148     DCI.AddToWorklist(Res.getNode());
25149     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25150                   /*AddTo*/ true);
25151     return true;
25152   }
25153
25154   // Attempt to blend with zero.
25155   if (NumMaskElts <= 8 &&
25156       ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
25157        (Subtarget.hasAVX() && VT.is256BitVector()))) {
25158     // Convert VT to a type compatible with X86ISD::BLENDI.
25159     // TODO - add 16i16 support (requires lane duplication).
25160     MVT ShuffleVT = MaskVT;
25161     if (Subtarget.hasAVX2()) {
25162       if (ShuffleVT == MVT::v4i64)
25163         ShuffleVT = MVT::v8i32;
25164       else if (ShuffleVT == MVT::v2i64)
25165         ShuffleVT = MVT::v4i32;
25166     } else {
25167       if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
25168         ShuffleVT = MVT::v8i16;
25169       else if (ShuffleVT == MVT::v4i64)
25170         ShuffleVT = MVT::v4f64;
25171       else if (ShuffleVT == MVT::v8i32)
25172         ShuffleVT = MVT::v8f32;
25173     }
25174
25175     if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
25176                                          /*Low*/ 0) &&
25177         NumMaskElts <= ShuffleVT.getVectorNumElements()) {
25178       unsigned BlendMask = 0;
25179       unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
25180       unsigned MaskRatio = ShuffleSize / NumMaskElts;
25181
25182       if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
25183         return false;
25184
            // Flag every ShuffleVT element that is covered by a zero/undef mask
            // entry (each mask entry spans MaskRatio elements).
            // NOTE(review): presumably a set bit makes BLENDI take that element
            // from the second (Zero) operand - confirm.
25185       for (unsigned i = 0; i != ShuffleSize; ++i)
25186         if (Mask[i / MaskRatio] < 0)
25187           BlendMask |= 1u << i;
25188
25189       SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
25190       Res = DAG.getBitcast(ShuffleVT, Input);
25191       DCI.AddToWorklist(Res.getNode());
25192       Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
25193                         DAG.getConstant(BlendMask, DL, MVT::i8));
25194       DCI.AddToWorklist(Res.getNode());
25195       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25196                     /*AddTo*/ true);
25197       return true;
25198     }
25199   }
25200
25201   // Attempt to combine to INSERTPS.
25202   if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
25203       (VT == MVT::v2f64 || VT == MVT::v4f32)) {
          // Both zero and undef (i.e. all negative) entries count as zeroable.
25204     SmallBitVector Zeroable(4, false);
25205     for (unsigned i = 0; i != NumMaskElts; ++i)
25206       if (Mask[i] < 0)
25207         Zeroable[i] = true;
25208
25209     unsigned InsertPSMask;
25210     SDValue V1 = Input, V2 = Input;
25211     if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
25212                                                        Zeroable, Mask, DAG)) {
25213       if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
25214         return false; // Nothing to do!
25215       V1 = DAG.getBitcast(MVT::v4f32, V1);
25216       DCI.AddToWorklist(V1.getNode());
25217       V2 = DAG.getBitcast(MVT::v4f32, V2);
25218       DCI.AddToWorklist(V2.getNode());
25219       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
25220                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
25221       DCI.AddToWorklist(Res.getNode());
25222       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25223                     /*AddTo*/ true);
25224       return true;
25225     }
25226   }
25227
25228   // Don't try to re-form single instruction chains under any circumstances now
25229   // that we've done encoding canonicalization for them.
25230   if (Depth < 2)
25231     return false;
25232
        // Neither of the variable mask combines below handles shuffles that
        // cross 128-bit lanes.
25233   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
25234     return false;
25235
25236   bool MaskContainsZeros =
25237       llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
25238
25239   // If we have a single input shuffle with different shuffle patterns in the
25240   // 128-bit lanes, use the variable mask form of VPERMILPS (VPERMILPV).
25241   // TODO Combine other mask types at higher depths.
25242   if (HasVariableMask && !MaskContainsZeros &&
25243       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
25244        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
          // Build the per element control vector: the index within the lane
          // (M % 4), or undef for undef mask entries.
25245     SmallVector<SDValue, 16> VPermIdx;
25246     for (int M : Mask) {
25247       SDValue Idx =
25248           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
25249       VPermIdx.push_back(Idx);
25250     }
25251     MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
25252     SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
25253     DCI.AddToWorklist(VPermMask.getNode());
25254     Res = DAG.getBitcast(MaskVT, Input);
25255     DCI.AddToWorklist(Res.getNode());
25256     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
25257     DCI.AddToWorklist(Res.getNode());
25258     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25259                   /*AddTo*/ true);
25260     return true;
25261   }
25262
25263   // If we have 3 or more shuffle instructions or a chain involving a variable
25264   // mask, we can replace them with a single PSHUFB instruction profitably.
25265   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
25266   // instructions, but in practice PSHUFB tends to be *very* fast so we're
25267   // more aggressive.
25268   if ((Depth >= 3 || HasVariableMask) &&
25269       ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
25270        (VT.is256BitVector() && Subtarget.hasAVX2()) ||
25271        (VT.is512BitVector() && Subtarget.hasBWI()))) {
          // Expand every mask element into Ratio consecutive byte selectors.
          // Undef elements stay undef and zero elements use the selector 255.
          // NOTE(review): 255 presumably relies on PSHUFB zeroing a byte whose
          // selector has the top bit set - confirm.
25272     SmallVector<SDValue, 16> PSHUFBMask;
25273     int NumBytes = VT.getSizeInBits() / 8;
25274     int Ratio = NumBytes / NumMaskElts;
25275     for (int i = 0; i < NumBytes; ++i) {
25276       int M = Mask[i / Ratio];
25277       if (M == SM_SentinelUndef) {
25278         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
25279         continue;
25280       }
25281       if (M == SM_SentinelZero) {
25282         PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
25283         continue;
25284       }
25285       M = Ratio * M + i % Ratio;
25286       assert ((M / 16) == (i / 16) && "Lane crossing detected");
25287       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
25288     }
25289     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
25290     Res = DAG.getBitcast(ByteVT, Input);
25291     DCI.AddToWorklist(Res.getNode());
25292     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
25293     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
25294     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
25295     DCI.AddToWorklist(Res.getNode());
25296     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25297                   /*AddTo*/ true);
25298     return true;
25299   }
25300
25301   // Failed to find any combines.
25302   return false;
25303 }
25304
25305 /// \brief Fully generic combining of x86 shuffle instructions.
25306 ///
25307 /// This should be the last combine run over the x86 shuffle instructions. Once
25308 /// they have been fully optimized, this will recursively consider all chains
25309 /// of single-use shuffle instructions, build a generic model of the cumulative
25310 /// shuffle operation, and check for simpler instructions which implement this
25311 /// operation. We use this primarily for two purposes:
25312 ///
25313 /// 1) Collapse generic shuffles to specialized single instructions when
25314 ///    equivalent. In most cases, this is just an encoding size win, but
25315 ///    sometimes we will collapse multiple generic shuffles into a single
25316 ///    special-purpose shuffle.
25317 /// 2) Look for sequences of shuffle instructions with 3 or more total
25318 ///    instructions, and replace them with the slightly more expensive SSSE3
25319 ///    PSHUFB instruction if available. We do this as the last combining step
25320 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
25321 ///    a suitable short sequence of other instructions. The PSHUFB will either
25322 ///    use a register or have to read from memory and so is slightly (but only
25323 ///    slightly) more expensive than the other shuffle instructions.
25324 ///
25325 /// Because this is inherently a quadratic operation (for each shuffle in
25326 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
25327 /// This should never be an issue in practice as the shuffle lowering doesn't
25328 /// produce sequences of more than 8 instructions.
25329 ///
25330 /// FIXME: We will currently miss some cases where the redundant shuffling
25331 /// would simplify under the threshold for PSHUFB formation because of
25332 /// combine-ordering. To fix this, we should do the redundant instruction
25333 /// combining in this recursive walk.
      ///
      /// \param Op shuffle node visited at this step (bitcasts whose operand has
      ///        a single use are looked through first).
      /// \param Root node that is replaced if a combine succeeds.
      /// \param RootMask mask accumulated so far; it is applied after Op's own
      ///        mask when the two are merged.
      /// \param Depth current depth of the walk; the walk gives up beyond 8.
      /// \param HasVariableMask set once a variable mask shuffle has been seen in
      ///        the chain; Op's own opcode is folded in before recursing.
      /// \returns true if Root was replaced.
25334 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
25335                                           ArrayRef<int> RootMask,
25336                                           int Depth, bool HasVariableMask,
25337                                           SelectionDAG &DAG,
25338                                           TargetLowering::DAGCombinerInfo &DCI,
25339                                           const X86Subtarget &Subtarget) {
25340   // Bound the depth of our recursive combine because this is ultimately
25341   // quadratic in nature.
25342   if (Depth > 8)
25343     return false;
25344
25345   // Directly rip through bitcasts to find the underlying operand.
25346   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
25347     Op = Op.getOperand(0);
25348
25349   MVT VT = Op.getSimpleValueType();
25350   if (!VT.isVector())
25351     return false; // Bail if we hit a non-vector.
25352
25353   assert(Root.getSimpleValueType().isVector() &&
25354          "Shuffles operate on vector types!");
25355   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
25356          "Can only combine shuffles of the same vector register size.");
25357
25358   // Extract target shuffle mask and resolve sentinels and inputs.
25359   SDValue Input0, Input1;
25360   SmallVector<int, 16> OpMask;
25361   if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
25362     return false;
25363
25364   assert(VT.getVectorNumElements() == OpMask.size() &&
25365          "Different mask size from vector size!");
25366   assert(((RootMask.size() > OpMask.size() &&
25367            RootMask.size() % OpMask.size() == 0) ||
25368           (OpMask.size() > RootMask.size() &&
25369            OpMask.size() % RootMask.size() == 0) ||
25370           OpMask.size() == RootMask.size()) &&
25371          "The smaller number of elements must divide the larger.");
        // The merged mask uses the finer of the two granularities. RootRatio and
        // OpRatio say how many merged elements one element of the respective
        // mask covers; at most one of them differs from 1.
25372   int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
25373   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
25374   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
25375   assert(((RootRatio == 1 && OpRatio == 1) ||
25376           (RootRatio == 1) != (OpRatio == 1)) &&
25377          "Must not have a ratio for both incoming and op masks!");
25378
25379   SmallVector<int, 16> Mask;
25380   Mask.reserve(MaskWidth);
25381
25382   // Merge this shuffle operation's mask into our accumulated mask. Note that
25383   // this shuffle's mask will be the first applied to the input, followed by the
25384   // root mask to get us all the way to the root value arrangement. The reason
25385   // for this order is that we are recursing up the operation chain.
25386   for (int i = 0; i < MaskWidth; ++i) {
25387     int RootIdx = i / RootRatio;
25388     if (RootMask[RootIdx] < 0) {
25389       // This is a zero or undef lane, we're done.
25390       Mask.push_back(RootMask[RootIdx]);
25391       continue;
25392     }
25393
          // Index (at merged granularity) that the root mask reads for lane i.
25394     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
25395     int OpIdx = RootMaskedIdx / OpRatio;
25396     if (OpMask[OpIdx] < 0) {
25397       // The incoming lanes are zero or undef, it doesn't matter which ones we
25398       // are using.
25399       Mask.push_back(OpMask[OpIdx]);
25400       continue;
25401     }
25402
25403     // Ok, we have non-zero lanes, map them through.
25404     Mask.push_back(OpMask[OpIdx] * OpRatio +
25405                    RootMaskedIdx % OpRatio);
25406   }
25407
25408   // Handle the all undef/zero cases early.
25409   if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
25410     DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
25411     return true;
25412   }
25413   if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
25414     // TODO - should we handle the mixed zero/undef case as well? Just returning
25415     // a zero mask will lose information on undef elements possibly reducing
25416     // future combine possibilities.
25417     DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
25418                                                 Subtarget, DAG, SDLoc(Root)));
25419     return true;
25420   }
25421
        // Indices in [0, MaskSize) refer to Input0, indices of MaskSize and above
        // refer to Input1.
25422   int MaskSize = Mask.size();
25423   bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
25424                   [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
25425   bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
25426                   [MaskSize](int Idx) { return MaskSize <= Idx; });
25427
25428   // At the moment we can only combine unary shuffle mask cases.
25429   if (UseInput0 && UseInput1)
25430     return false;
25431   else if (UseInput1) {
          // Only the second input is referenced: commute so that the remaining
          // code can treat Input0 as the single source.
25432     std::swap(Input0, Input1);
25433     ShuffleVectorSDNode::commuteMask(Mask);
25434   }
25435
25436   assert(Input0 && "Shuffle with no inputs detected");
25437
25438   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
25439
25440   // See if we can recurse into Input0 (if it's a target shuffle).
25441   if (Op->isOnlyUserOf(Input0.getNode()) &&
25442       combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
25443                                     HasVariableMask, DAG, DCI, Subtarget))
25444     return true;
25445
25446   // Minor canonicalization of the accumulated shuffle mask to make it easier
25447   // to match below. All this does is detect masks with sequential pairs of
25448   // elements, and shrink them to the half-width mask. It does this in a loop
25449   // so it will reduce the size of the mask to the minimal width mask which
25450   // performs an equivalent shuffle.
25451   SmallVector<int, 16> WidenedMask;
25452   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
25453     Mask = std::move(WidenedMask);
25454   }
25455
25456   return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
25457                                 DCI, Subtarget);
25458 }
25459
25460 /// \brief Get the PSHUF-style mask from PSHUF node.
25461 ///
25462 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
25463 /// PSHUF-style masks that can be reused with such instructions.
25464 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
25465   MVT VT = N.getSimpleValueType();
25466   SmallVector<int, 4> Mask;
25467   SmallVector<SDValue, 2> Ops;
25468   bool IsUnary;
25469   bool HaveMask =
25470       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
25471   (void)HaveMask;
25472   assert(HaveMask);
25473
25474   // If we have more than 128-bits, only the low 128-bits of shuffle mask
25475   // matter. Check that the upper masks are repeats and remove them.
25476   if (VT.getSizeInBits() > 128) {
25477     int LaneElts = 128 / VT.getScalarSizeInBits();
25478 #ifndef NDEBUG
25479     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
25480       for (int j = 0; j < LaneElts; ++j)
25481         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
25482                "Mask doesn't repeat in high 128-bit lanes!");
25483 #endif
25484     Mask.resize(LaneElts);
25485   }
25486
25487   switch (N.getOpcode()) {
25488   case X86ISD::PSHUFD:
25489     return Mask;
25490   case X86ISD::PSHUFLW:
25491     Mask.resize(4);
25492     return Mask;
25493   case X86ISD::PSHUFHW:
25494     Mask.erase(Mask.begin(), Mask.begin() + 4);
25495     for (int &M : Mask)
25496       M -= 4;
25497     return Mask;
25498   default:
25499     llvm_unreachable("No valid shuffle instruction found!");
25500   }
25501 }
25502
25503 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
25504 ///
25505 /// We walk up the chain and look for a combinable shuffle, skipping over
25506 /// shuffles that we could hoist this shuffle's transformation past without
25507 /// altering anything.
25508 static SDValue
25509 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
25510                              SelectionDAG &DAG,
25511                              TargetLowering::DAGCombinerInfo &DCI) {
25512   assert(N.getOpcode() == X86ISD::PSHUFD &&
25513          "Called with something other than an x86 128-bit half shuffle!");
25514   SDLoc DL(N);
25515
25516   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
25517   // of the shuffles in the chain so that we can form a fresh chain to replace
25518   // this one.
25519   SmallVector<SDValue, 8> Chain;
25520   SDValue V = N.getOperand(0);
25521   for (; V.hasOneUse(); V = V.getOperand(0)) {
25522     switch (V.getOpcode()) {
25523     default:
25524       return SDValue(); // Nothing combined!
25525
25526     case ISD::BITCAST:
25527       // Skip bitcasts as we always know the type for the target specific
25528       // instructions.
25529       continue;
25530
25531     case X86ISD::PSHUFD:
25532       // Found another dword shuffle.
25533       break;
25534
25535     case X86ISD::PSHUFLW:
25536       // Check that the low words (being shuffled) are the identity in the
25537       // dword shuffle, and the high words are self-contained.
25538       if (Mask[0] != 0 || Mask[1] != 1 ||
25539           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
25540         return SDValue();
25541
25542       Chain.push_back(V);
25543       continue;
25544
25545     case X86ISD::PSHUFHW:
25546       // Check that the high words (being shuffled) are the identity in the
25547       // dword shuffle, and the low words are self-contained.
25548       if (Mask[2] != 2 || Mask[3] != 3 ||
25549           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
25550         return SDValue();
25551
25552       Chain.push_back(V);
25553       continue;
25554
25555     case X86ISD::UNPCKL:
25556     case X86ISD::UNPCKH:
25557       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
25558       // shuffle into a preceding word shuffle.
25559       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
25560           V.getSimpleValueType().getVectorElementType() != MVT::i16)
25561         return SDValue();
25562
25563       // Search for a half-shuffle which we can combine with.
25564       unsigned CombineOp =
25565           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
25566       if (V.getOperand(0) != V.getOperand(1) ||
25567           !V->isOnlyUserOf(V.getOperand(0).getNode()))
25568         return SDValue();
25569       Chain.push_back(V);
25570       V = V.getOperand(0);
25571       do {
25572         switch (V.getOpcode()) {
25573         default:
25574           return SDValue(); // Nothing to combine.
25575
25576         case X86ISD::PSHUFLW:
25577         case X86ISD::PSHUFHW:
25578           if (V.getOpcode() == CombineOp)
25579             break;
25580
25581           Chain.push_back(V);
25582
25583           // Fallthrough!
25584         case ISD::BITCAST:
25585           V = V.getOperand(0);
25586           continue;
25587         }
25588         break;
25589       } while (V.hasOneUse());
25590       break;
25591     }
25592     // Break out of the loop if we break out of the switch.
25593     break;
25594   }
25595
25596   if (!V.hasOneUse())
25597     // We fell out of the loop without finding a viable combining instruction.
25598     return SDValue();
25599
25600   // Merge this node's mask and our incoming mask.
25601   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25602   for (int &M : Mask)
25603     M = VMask[M];
25604   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
25605                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25606
25607   // Rebuild the chain around this new shuffle.
25608   while (!Chain.empty()) {
25609     SDValue W = Chain.pop_back_val();
25610
25611     if (V.getValueType() != W.getOperand(0).getValueType())
25612       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
25613
25614     switch (W.getOpcode()) {
25615     default:
25616       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
25617
25618     case X86ISD::UNPCKL:
25619     case X86ISD::UNPCKH:
25620       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
25621       break;
25622
25623     case X86ISD::PSHUFD:
25624     case X86ISD::PSHUFLW:
25625     case X86ISD::PSHUFHW:
25626       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
25627       break;
25628     }
25629   }
25630   if (V.getValueType() != N.getValueType())
25631     V = DAG.getBitcast(N.getValueType(), V);
25632
25633   // Return the new chain to replace N.
25634   return V;
25635 }
25636
25637 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
25638 /// pshufhw.
25639 ///
25640 /// We walk up the chain, skipping shuffles of the other half and looking
25641 /// through shuffles which switch halves trying to find a shuffle of the same
25642 /// pair of dwords.
25643 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
25644                                         SelectionDAG &DAG,
25645                                         TargetLowering::DAGCombinerInfo &DCI) {
25646   assert(
25647       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
25648       "Called with something other than an x86 128-bit half shuffle!");
25649   SDLoc DL(N);
25650   unsigned CombineOpcode = N.getOpcode();
25651
25652   // Walk up a single-use chain looking for a combinable shuffle.
25653   SDValue V = N.getOperand(0);
25654   for (; V.hasOneUse(); V = V.getOperand(0)) {
25655     switch (V.getOpcode()) {
25656     default:
25657       return false; // Nothing combined!
25658
25659     case ISD::BITCAST:
25660       // Skip bitcasts as we always know the type for the target specific
25661       // instructions.
25662       continue;
25663
25664     case X86ISD::PSHUFLW:
25665     case X86ISD::PSHUFHW:
25666       if (V.getOpcode() == CombineOpcode)
25667         break;
25668
25669       // Other-half shuffles are no-ops.
25670       continue;
25671     }
25672     // Break out of the loop if we break out of the switch.
25673     break;
25674   }
25675
25676   if (!V.hasOneUse())
25677     // We fell out of the loop without finding a viable combining instruction.
25678     return false;
25679
25680   // Combine away the bottom node as its shuffle will be accumulated into
25681   // a preceding shuffle.
25682   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25683
25684   // Record the old value.
25685   SDValue Old = V;
25686
25687   // Merge this node's mask and our incoming mask (adjusted to account for all
25688   // the pshufd instructions encountered).
25689   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25690   for (int &M : Mask)
25691     M = VMask[M];
25692   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
25693                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25694
25695   // Check that the shuffles didn't cancel each other out. If not, we need to
25696   // combine to the new one.
25697   if (Old != V)
25698     // Replace the combinable shuffle with the combined one, updating all users
25699     // so that we re-evaluate the chain here.
25700     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
25701
25702   return true;
25703 }
25704
25705 /// \brief Try to combine x86 target specific shuffles.
25706 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
25707                                     TargetLowering::DAGCombinerInfo &DCI,
25708                                     const X86Subtarget &Subtarget) {
25709   SDLoc DL(N);
25710   MVT VT = N.getSimpleValueType();
25711   SmallVector<int, 4> Mask;
25712
25713   switch (N.getOpcode()) {
25714   case X86ISD::PSHUFD:
25715   case X86ISD::PSHUFLW:
25716   case X86ISD::PSHUFHW:
25717     Mask = getPSHUFShuffleMask(N);
25718     assert(Mask.size() == 4);
25719     break;
25720   case X86ISD::UNPCKL: {
25721     // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
25722     // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
25723     // moves upper half elements into the lower half part. For example:
25724     //
25725     // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
25726     //     undef:v16i8
25727     // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
25728     //
25729     // will be combined to:
25730     //
25731     // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
25732
25733     // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
25734     // happen due to advanced instructions.
25735     if (!VT.is128BitVector())
25736       return SDValue();
25737
25738     auto Op0 = N.getOperand(0);
25739     auto Op1 = N.getOperand(1);
25740     if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
25741       ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
25742
25743       unsigned NumElts = VT.getVectorNumElements();
25744       SmallVector<int, 8> ExpectedMask(NumElts, -1);
25745       std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
25746                 NumElts / 2);
25747
25748       auto ShufOp = Op1.getOperand(0);
25749       if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
25750         return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
25751     }
25752     return SDValue();
25753   }
25754   case X86ISD::BLENDI: {
25755     SDValue V0 = N->getOperand(0);
25756     SDValue V1 = N->getOperand(1);
25757     assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
25758            "Unexpected input vector types");
25759
25760     // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
25761     // operands and changing the mask to 1. This saves us a bunch of
25762     // pattern-matching possibilities related to scalar math ops in SSE/AVX.
25763     // x86InstrInfo knows how to commute this back after instruction selection
25764     // if it would help register allocation.
25765
25766     // TODO: If optimizing for size or a processor that doesn't suffer from
25767     // partial register update stalls, this should be transformed into a MOVSD
25768     // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
25769
25770     if (VT == MVT::v2f64)
25771       if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
25772         if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
25773           SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
25774           return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
25775         }
25776
25777     // Attempt to merge blend(insertps(x,y),zero).
25778     if (V0.getOpcode() == X86ISD::INSERTPS ||
25779         V1.getOpcode() == X86ISD::INSERTPS) {
25780       assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25781
25782       // Determine which elements are known to be zero.
25783       SmallVector<int, 8> TargetMask;
25784       SmallVector<SDValue, 2> BlendOps;
25785       if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
25786         return SDValue();
25787
25788       // Helper function to take inner insertps node and attempt to
25789       // merge the blend with zero into its zero mask.
25790       auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
25791         if (V.getOpcode() != X86ISD::INSERTPS)
25792           return SDValue();
25793         SDValue Op0 = V.getOperand(0);
25794         SDValue Op1 = V.getOperand(1);
25795         SDValue Op2 = V.getOperand(2);
25796         unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25797
25798         // Check each element of the blend node's target mask - must either
25799         // be zeroable (and update the zero mask) or selects the element from
25800         // the inner insertps node.
25801         for (int i = 0; i != 4; ++i)
25802           if (TargetMask[i] < 0)
25803             InsertPSMask |= (1u << i);
25804           else if (TargetMask[i] != (i + Offset))
25805             return SDValue();
25806         return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
25807                            DAG.getConstant(InsertPSMask, DL, MVT::i8));
25808       };
25809
25810       if (SDValue V = MergeInsertPSAndBlend(V0, 0))
25811         return V;
25812       if (SDValue V = MergeInsertPSAndBlend(V1, 4))
25813         return V;
25814     }
25815     return SDValue();
25816   }
25817   case X86ISD::INSERTPS: {
25818     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25819     SDValue Op0 = N.getOperand(0);
25820     SDValue Op1 = N.getOperand(1);
25821     SDValue Op2 = N.getOperand(2);
25822     unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25823     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
25824     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
25825     unsigned ZeroMask = InsertPSMask & 0xF;
25826
25827     // If we zero out all elements from Op0 then we don't need to reference it.
25828     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
25829       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
25830                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
25831
25832     // If we zero out the element from Op1 then we don't need to reference it.
25833     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
25834       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25835                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
25836
25837     // Attempt to merge insertps Op1 with an inner target shuffle node.
25838     SmallVector<int, 8> TargetMask1;
25839     SmallVector<SDValue, 2> Ops1;
25840     if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
25841       int M = TargetMask1[SrcIdx];
25842       if (isUndefOrZero(M)) {
25843         // Zero/UNDEF insertion - zero out element and remove dependency.
25844         InsertPSMask |= (1u << DstIdx);
25845         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25846                            DAG.getConstant(InsertPSMask, DL, MVT::i8));
25847       }
25848       // Update insertps mask srcidx and reference the source input directly.
25849       assert(0 <= M && M < 8 && "Shuffle index out of range");
25850       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
25851       Op1 = Ops1[M < 4 ? 0 : 1];
25852       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25853                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
25854     }
25855
25856     // Attempt to merge insertps Op0 with an inner target shuffle node.
25857     SmallVector<int, 8> TargetMask0;
25858     SmallVector<SDValue, 2> Ops0;
25859     if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
25860       return SDValue();
25861
25862     bool Updated = false;
25863     bool UseInput00 = false;
25864     bool UseInput01 = false;
25865     for (int i = 0; i != 4; ++i) {
25866       int M = TargetMask0[i];
25867       if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
25868         // No change if element is already zero or the inserted element.
25869         continue;
25870       } else if (isUndefOrZero(M)) {
25871         // If the target mask is undef/zero then we must zero the element.
25872         InsertPSMask |= (1u << i);
25873         Updated = true;
25874         continue;
25875       }
25876
25877       // The input vector element must be inline.
25878       if (M != i && M != (i + 4))
25879         return SDValue();
25880
25881       // Determine which inputs of the target shuffle we're using.
25882       UseInput00 |= (0 <= M && M < 4);
25883       UseInput01 |= (4 <= M);
25884     }
25885
25886     // If we're not using both inputs of the target shuffle then use the
25887     // referenced input directly.
25888     if (UseInput00 && !UseInput01) {
25889       Updated = true;
25890       Op0 = Ops0[0];
25891     } else if (!UseInput00 && UseInput01) {
25892       Updated = true;
25893       Op0 = Ops0[1];
25894     }
25895
25896     if (Updated)
25897       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25898                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
25899
25900     return SDValue();
25901   }
25902   default:
25903     return SDValue();
25904   }
25905
25906   // Nuke no-op shuffles that show up after combining.
25907   if (isNoopShuffleMask(Mask))
25908     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25909
25910   // Look for simplifications involving one or two shuffle instructions.
25911   SDValue V = N.getOperand(0);
25912   switch (N.getOpcode()) {
25913   default:
25914     break;
25915   case X86ISD::PSHUFLW:
25916   case X86ISD::PSHUFHW:
25917     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
25918
25919     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
25920       return SDValue(); // We combined away this shuffle, so we're done.
25921
25922     // See if this reduces to a PSHUFD which is no more expensive and can
25923     // combine with more operations. Note that it has to at least flip the
25924     // dwords as otherwise it would have been removed as a no-op.
25925     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
25926       int DMask[] = {0, 1, 2, 3};
25927       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
25928       DMask[DOffset + 0] = DOffset + 1;
25929       DMask[DOffset + 1] = DOffset + 0;
25930       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
25931       V = DAG.getBitcast(DVT, V);
25932       DCI.AddToWorklist(V.getNode());
25933       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
25934                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
25935       DCI.AddToWorklist(V.getNode());
25936       return DAG.getBitcast(VT, V);
25937     }
25938
25939     // Look for shuffle patterns which can be implemented as a single unpack.
25940     // FIXME: This doesn't handle the location of the PSHUFD generically, and
25941     // only works when we have a PSHUFD followed by two half-shuffles.
25942     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
25943         (V.getOpcode() == X86ISD::PSHUFLW ||
25944          V.getOpcode() == X86ISD::PSHUFHW) &&
25945         V.getOpcode() != N.getOpcode() &&
25946         V.hasOneUse()) {
25947       SDValue D = V.getOperand(0);
25948       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
25949         D = D.getOperand(0);
25950       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
25951         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25952         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
25953         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25954         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25955         int WordMask[8];
25956         for (int i = 0; i < 4; ++i) {
25957           WordMask[i + NOffset] = Mask[i] + NOffset;
25958           WordMask[i + VOffset] = VMask[i] + VOffset;
25959         }
25960         // Map the word mask through the DWord mask.
25961         int MappedMask[8];
25962         for (int i = 0; i < 8; ++i)
25963           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
25964         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
25965             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
25966           // We can replace all three shuffles with an unpack.
25967           V = DAG.getBitcast(VT, D.getOperand(0));
25968           DCI.AddToWorklist(V.getNode());
25969           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
25970                                                 : X86ISD::UNPCKH,
25971                              DL, VT, V, V);
25972         }
25973       }
25974     }
25975
25976     break;
25977
25978   case X86ISD::PSHUFD:
25979     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
25980       return NewN;
25981
25982     break;
25983   }
25984
25985   return SDValue();
25986 }
25987
25988 /// \brief Try to combine a shuffle into a target-specific add-sub node.
25989 ///
25990 /// We combine this directly on the abstract vector shuffle nodes so it is
25991 /// easier to generically match. We also insert dummy vector shuffle nodes for
25992 /// the operands which explicitly discard the lanes which are unused by this
25993 /// operation to try to flow through the rest of the combiner the fact that
25994 /// they're unused.
25995 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
25996                                       SelectionDAG &DAG) {
25997   SDLoc DL(N);
25998   EVT VT = N->getValueType(0);
25999   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
26000       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
26001     return SDValue();
26002
26003   // We only handle target-independent shuffles.
26004   // FIXME: It would be easy and harmless to use the target shuffle mask
26005   // extraction tool to support more.
26006   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
26007     return SDValue();
26008
26009   auto *SVN = cast<ShuffleVectorSDNode>(N);
26010   SmallVector<int, 8> Mask;
26011   for (int M : SVN->getMask())
26012     Mask.push_back(M);
26013
26014   SDValue V1 = N->getOperand(0);
26015   SDValue V2 = N->getOperand(1);
26016
26017   // We require the first shuffle operand to be the FSUB node, and the second to
26018   // be the FADD node.
26019   if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
26020     ShuffleVectorSDNode::commuteMask(Mask);
26021     std::swap(V1, V2);
26022   } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
26023     return SDValue();
26024
26025   // If there are other uses of these operations we can't fold them.
26026   if (!V1->hasOneUse() || !V2->hasOneUse())
26027     return SDValue();
26028
26029   // Ensure that both operations have the same operands. Note that we can
26030   // commute the FADD operands.
26031   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
26032   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
26033       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
26034     return SDValue();
26035
26036   // We're looking for blends between FADD and FSUB nodes. We insist on these
26037   // nodes being lined up in a specific expected pattern.
26038   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
26039         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
26040         isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
26041     return SDValue();
26042
26043   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
26044 }
26045
26046 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
26047                               TargetLowering::DAGCombinerInfo &DCI,
26048                               const X86Subtarget &Subtarget) {
26049   SDLoc dl(N);
26050   EVT VT = N->getValueType(0);
26051
26052   // Don't create instructions with illegal types after legalize types has run.
26053   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26054   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
26055     return SDValue();
26056
26057   // If we have legalized the vector types, look for blends of FADD and FSUB
26058   // nodes that we can fuse into an ADDSUB node.
26059   if (TLI.isTypeLegal(VT))
26060     if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
26061       return AddSub;
26062
26063   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
26064   if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
26065       N->getOpcode() == ISD::VECTOR_SHUFFLE)
26066     return combineShuffle256(N, DAG, DCI, Subtarget);
26067
26068   // During Type Legalization, when promoting illegal vector types,
26069   // the backend might introduce new shuffle dag nodes and bitcasts.
26070   //
26071   // This code performs the following transformation:
26072   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
26073   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
26074   //
26075   // We do this only if both the bitcast and the BINOP dag nodes have
26076   // one use. Also, perform this transformation only if the new binary
26077   // operation is legal. This is to avoid introducing dag nodes that
26078   // potentially need to be further expanded (or custom lowered) into a
26079   // less optimal sequence of dag nodes.
26080   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
26081       N->getOpcode() == ISD::VECTOR_SHUFFLE &&
26082       N->getOperand(0).getOpcode() == ISD::BITCAST &&
26083       N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
26084     SDValue N0 = N->getOperand(0);
26085     SDValue N1 = N->getOperand(1);
26086
26087     SDValue BC0 = N0.getOperand(0);
26088     EVT SVT = BC0.getValueType();
26089     unsigned Opcode = BC0.getOpcode();
26090     unsigned NumElts = VT.getVectorNumElements();
26091
26092     if (BC0.hasOneUse() && SVT.isVector() &&
26093         SVT.getVectorNumElements() * 2 == NumElts &&
26094         TLI.isOperationLegal(Opcode, VT)) {
26095       bool CanFold = false;
26096       switch (Opcode) {
26097       default : break;
26098       case ISD::ADD :
26099       case ISD::FADD :
26100       case ISD::SUB :
26101       case ISD::FSUB :
26102       case ISD::MUL :
26103       case ISD::FMUL :
26104         CanFold = true;
26105       }
26106
26107       unsigned SVTNumElts = SVT.getVectorNumElements();
26108       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
26109       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
26110         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
26111       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
26112         CanFold = SVOp->getMaskElt(i) < 0;
26113
26114       if (CanFold) {
26115         SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
26116         SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
26117         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
26118         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
26119       }
26120     }
26121   }
26122
26123   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
26124   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
26125   // consecutive, non-overlapping, and in the right order.
26126   SmallVector<SDValue, 16> Elts;
26127   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
26128     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
26129
26130   if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
26131     return LD;
26132
26133   if (isTargetShuffle(N->getOpcode())) {
26134     if (SDValue Shuffle =
26135             combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
26136       return Shuffle;
26137
26138     // Try recursively combining arbitrary sequences of x86 shuffle
26139     // instructions into higher-order shuffles. We do this after combining
26140     // specific PSHUF instruction sequences into their minimal form so that we
26141     // can evaluate how many specialized shuffle instructions are involved in
26142     // a particular chain.
26143     SmallVector<int, 1> NonceMask; // Just a placeholder.
26144     NonceMask.push_back(0);
26145     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
26146                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
26147                                       DCI, Subtarget))
26148       return SDValue(); // This routine will use CombineTo to replace N.
26149   }
26150
26151   return SDValue();
26152 }
26153
26154 /// Check if a vector extract from a target-specific shuffle of a load can be
26155 /// folded into a single element load.
26156 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
26157 /// shuffles have been custom lowered so we need to handle those here.
26158 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
26159                                          TargetLowering::DAGCombinerInfo &DCI) {
26160   if (DCI.isBeforeLegalizeOps())
26161     return SDValue();
26162
26163   SDValue InVec = N->getOperand(0);
26164   SDValue EltNo = N->getOperand(1);
26165   EVT EltVT = N->getValueType(0);
26166
26167   if (!isa<ConstantSDNode>(EltNo))
26168     return SDValue();
26169
26170   EVT OriginalVT = InVec.getValueType();
26171
26172   if (InVec.getOpcode() == ISD::BITCAST) {
26173     // Don't duplicate a load with other uses.
26174     if (!InVec.hasOneUse())
26175       return SDValue();
26176     EVT BCVT = InVec.getOperand(0).getValueType();
26177     if (!BCVT.isVector() ||
26178         BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
26179       return SDValue();
26180     InVec = InVec.getOperand(0);
26181   }
26182
26183   EVT CurrentVT = InVec.getValueType();
26184
26185   if (!isTargetShuffle(InVec.getOpcode()))
26186     return SDValue();
26187
26188   // Don't duplicate a load with other uses.
26189   if (!InVec.hasOneUse())
26190     return SDValue();
26191
26192   SmallVector<int, 16> ShuffleMask;
26193   SmallVector<SDValue, 2> ShuffleOps;
26194   bool UnaryShuffle;
26195   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
26196                             ShuffleOps, ShuffleMask, UnaryShuffle))
26197     return SDValue();
26198
26199   // Select the input vector, guarding against out of range extract vector.
26200   unsigned NumElems = CurrentVT.getVectorNumElements();
26201   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
26202   int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
26203
26204   if (Idx == SM_SentinelZero)
26205     return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
26206                              : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
26207   if (Idx == SM_SentinelUndef)
26208     return DAG.getUNDEF(EltVT);
26209
26210   assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
26211   SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
26212                                          : ShuffleOps[1];
26213
26214   // If inputs to shuffle are the same for both ops, then allow 2 uses
26215   unsigned AllowedUses =
26216       (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
26217
26218   if (LdNode.getOpcode() == ISD::BITCAST) {
26219     // Don't duplicate a load with other uses.
26220     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
26221       return SDValue();
26222
26223     AllowedUses = 1; // only allow 1 load use if we have a bitcast
26224     LdNode = LdNode.getOperand(0);
26225   }
26226
26227   if (!ISD::isNormalLoad(LdNode.getNode()))
26228     return SDValue();
26229
26230   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
26231
26232   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
26233     return SDValue();
26234
26235   // If there's a bitcast before the shuffle, check if the load type and
26236   // alignment is valid.
26237   unsigned Align = LN0->getAlignment();
26238   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26239   unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
26240       EltVT.getTypeForEVT(*DAG.getContext()));
26241
26242   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
26243     return SDValue();
26244
26245   // All checks match so transform back to vector_shuffle so that DAG combiner
26246   // can finish the job
26247   SDLoc dl(N);
26248
26249   // Create shuffle node taking into account the case that its a unary shuffle
26250   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
26251   Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
26252                                  ShuffleMask);
26253   Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
26254   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
26255                      EltNo);
26256 }
26257
26258 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
26259                               const X86Subtarget &Subtarget) {
26260   SDValue N0 = N->getOperand(0);
26261   EVT VT = N->getValueType(0);
26262
26263   // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
26264   // special and don't usually play with other vector types, it's better to
26265   // handle them early to be sure we emit efficient code by avoiding
26266   // store-load conversions.
26267   if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
26268       N0.getValueType() == MVT::v2i32 &&
26269       isNullConstant(N0.getOperand(1))) {
26270     SDValue N00 = N0->getOperand(0);
26271     if (N00.getValueType() == MVT::i32)
26272       return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
26273   }
26274
26275   // Convert a bitcasted integer logic operation that has one bitcasted
26276   // floating-point operand and one constant operand into a floating-point
26277   // logic operation. This may create a load of the constant, but that is
26278   // cheaper than materializing the constant in an integer register and
26279   // transferring it to an SSE register or transferring the SSE operand to
26280   // integer register and back.
26281   unsigned FPOpcode;
26282   switch (N0.getOpcode()) {
26283     case ISD::AND: FPOpcode = X86ISD::FAND; break;
26284     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
26285     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
26286     default: return SDValue();
26287   }
26288   if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
26289        (Subtarget.hasSSE2() && VT == MVT::f64)) &&
26290       isa<ConstantSDNode>(N0.getOperand(1)) &&
26291       N0.getOperand(0).getOpcode() == ISD::BITCAST &&
26292       N0.getOperand(0).getOperand(0).getValueType() == VT) {
26293     SDValue N000 = N0.getOperand(0).getOperand(0);
26294     SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
26295     return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
26296   }
26297
26298   return SDValue();
26299 }
26300
26301 /// Detect vector gather/scatter index generation and convert it from being a
26302 /// bunch of shuffles and extracts into a somewhat faster sequence.
26303 /// For i686, the best sequence is apparently storing the value and loading
26304 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
26305 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
26306                                        TargetLowering::DAGCombinerInfo &DCI) {
26307   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
26308     return NewOp;
26309
26310   SDValue InputVector = N->getOperand(0);
26311   SDLoc dl(InputVector);
26312   // Detect mmx to i32 conversion through a v2i32 elt extract.
26313   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
26314       N->getValueType(0) == MVT::i32 &&
26315       InputVector.getValueType() == MVT::v2i32 &&
26316       isa<ConstantSDNode>(N->getOperand(1)) &&
26317       N->getConstantOperandVal(1) == 0) {
26318     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
26319
26320     // The bitcast source is a direct mmx result.
26321     if (MMXSrc.getValueType() == MVT::x86mmx)
26322       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
26323   }
26324
26325   EVT VT = N->getValueType(0);
26326
26327   if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
26328       InputVector.getOpcode() == ISD::BITCAST &&
26329       isa<ConstantSDNode>(InputVector.getOperand(0))) {
26330     uint64_t ExtractedElt =
26331         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
26332     uint64_t InputValue =
26333         cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
26334     uint64_t Res = (InputValue >> ExtractedElt) & 1;
26335     return DAG.getConstant(Res, dl, MVT::i1);
26336   }
26337   // Only operate on vectors of 4 elements, where the alternative shuffling
26338   // gets to be more expensive.
26339   if (InputVector.getValueType() != MVT::v4i32)
26340     return SDValue();
26341
26342   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
26343   // single use which is a sign-extend or zero-extend, and all elements are
26344   // used.
26345   SmallVector<SDNode *, 4> Uses;
26346   unsigned ExtractedElements = 0;
26347   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
26348        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
26349     if (UI.getUse().getResNo() != InputVector.getResNo())
26350       return SDValue();
26351
26352     SDNode *Extract = *UI;
26353     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26354       return SDValue();
26355
26356     if (Extract->getValueType(0) != MVT::i32)
26357       return SDValue();
26358     if (!Extract->hasOneUse())
26359       return SDValue();
26360     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
26361         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
26362       return SDValue();
26363     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
26364       return SDValue();
26365
26366     // Record which element was extracted.
26367     ExtractedElements |=
26368       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
26369
26370     Uses.push_back(Extract);
26371   }
26372
26373   // If not all the elements were used, this may not be worthwhile.
26374   if (ExtractedElements != 15)
26375     return SDValue();
26376
26377   // Ok, we've now decided to do the transformation.
26378   // If 64-bit shifts are legal, use the extract-shift sequence,
26379   // otherwise bounce the vector off the cache.
26380   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26381   SDValue Vals[4];
26382
26383   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
26384     SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
26385     auto &DL = DAG.getDataLayout();
26386     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
26387     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26388       DAG.getConstant(0, dl, VecIdxTy));
26389     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26390       DAG.getConstant(1, dl, VecIdxTy));
26391
26392     SDValue ShAmt = DAG.getConstant(
26393         32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
26394     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
26395     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26396       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
26397     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
26398     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26399       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
26400   } else {
26401     // Store the value to a temporary stack slot.
26402     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
26403     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
26404                               MachinePointerInfo());
26405
26406     EVT ElementType = InputVector.getValueType().getVectorElementType();
26407     unsigned EltSize = ElementType.getSizeInBits() / 8;
26408
26409     // Replace each use (extract) with a load of the appropriate element.
26410     for (unsigned i = 0; i < 4; ++i) {
26411       uint64_t Offset = EltSize * i;
26412       auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26413       SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
26414
26415       SDValue ScalarAddr =
26416           DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
26417
26418       // Load the scalar.
26419       Vals[i] =
26420           DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
26421     }
26422   }
26423
26424   // Replace the extracts
26425   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
26426     UE = Uses.end(); UI != UE; ++UI) {
26427     SDNode *Extract = *UI;
26428
26429     SDValue Idx = Extract->getOperand(1);
26430     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
26431     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
26432   }
26433
26434   // The replacement was made in place; don't return anything.
26435   return SDValue();
26436 }
26437
26438 /// Do target-specific dag combines on SELECT and VSELECT nodes.
26439 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
26440                              TargetLowering::DAGCombinerInfo &DCI,
26441                              const X86Subtarget &Subtarget) {
26442   SDLoc DL(N);
26443   SDValue Cond = N->getOperand(0);
26444   // Get the LHS/RHS of the select.
26445   SDValue LHS = N->getOperand(1);
26446   SDValue RHS = N->getOperand(2);
26447   EVT VT = LHS.getValueType();
26448   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26449
26450   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
26451   // instructions match the semantics of the common C idiom x<y?x:y but not
26452   // x<=y?x:y, because of how they handle negative zero (which can be
26453   // ignored in unsafe-math mode).
26454   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
26455   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
26456       VT != MVT::f80 && VT != MVT::f128 &&
26457       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
26458       (Subtarget.hasSSE2() ||
26459        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
26460     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26461
26462     unsigned Opcode = 0;
26463     // Check for x CC y ? x : y.
26464     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26465         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26466       switch (CC) {
26467       default: break;
26468       case ISD::SETULT:
26469         // Converting this to a min would handle NaNs incorrectly, and swapping
26470         // the operands would cause it to handle comparisons between positive
26471         // and negative zero incorrectly.
26472         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26473           if (!DAG.getTarget().Options.UnsafeFPMath &&
26474               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26475             break;
26476           std::swap(LHS, RHS);
26477         }
26478         Opcode = X86ISD::FMIN;
26479         break;
26480       case ISD::SETOLE:
26481         // Converting this to a min would handle comparisons between positive
26482         // and negative zero incorrectly.
26483         if (!DAG.getTarget().Options.UnsafeFPMath &&
26484             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26485           break;
26486         Opcode = X86ISD::FMIN;
26487         break;
26488       case ISD::SETULE:
26489         // Converting this to a min would handle both negative zeros and NaNs
26490         // incorrectly, but we can swap the operands to fix both.
26491         std::swap(LHS, RHS);
26492       case ISD::SETOLT:
26493       case ISD::SETLT:
26494       case ISD::SETLE:
26495         Opcode = X86ISD::FMIN;
26496         break;
26497
26498       case ISD::SETOGE:
26499         // Converting this to a max would handle comparisons between positive
26500         // and negative zero incorrectly.
26501         if (!DAG.getTarget().Options.UnsafeFPMath &&
26502             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26503           break;
26504         Opcode = X86ISD::FMAX;
26505         break;
26506       case ISD::SETUGT:
26507         // Converting this to a max would handle NaNs incorrectly, and swapping
26508         // the operands would cause it to handle comparisons between positive
26509         // and negative zero incorrectly.
26510         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26511           if (!DAG.getTarget().Options.UnsafeFPMath &&
26512               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26513             break;
26514           std::swap(LHS, RHS);
26515         }
26516         Opcode = X86ISD::FMAX;
26517         break;
26518       case ISD::SETUGE:
26519         // Converting this to a max would handle both negative zeros and NaNs
26520         // incorrectly, but we can swap the operands to fix both.
26521         std::swap(LHS, RHS);
26522       case ISD::SETOGT:
26523       case ISD::SETGT:
26524       case ISD::SETGE:
26525         Opcode = X86ISD::FMAX;
26526         break;
26527       }
26528     // Check for x CC y ? y : x -- a min/max with reversed arms.
26529     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
26530                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
26531       switch (CC) {
26532       default: break;
26533       case ISD::SETOGE:
26534         // Converting this to a min would handle comparisons between positive
26535         // and negative zero incorrectly, and swapping the operands would
26536         // cause it to handle NaNs incorrectly.
26537         if (!DAG.getTarget().Options.UnsafeFPMath &&
26538             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
26539           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26540             break;
26541           std::swap(LHS, RHS);
26542         }
26543         Opcode = X86ISD::FMIN;
26544         break;
26545       case ISD::SETUGT:
26546         // Converting this to a min would handle NaNs incorrectly.
26547         if (!DAG.getTarget().Options.UnsafeFPMath &&
26548             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
26549           break;
26550         Opcode = X86ISD::FMIN;
26551         break;
26552       case ISD::SETUGE:
26553         // Converting this to a min would handle both negative zeros and NaNs
26554         // incorrectly, but we can swap the operands to fix both.
26555         std::swap(LHS, RHS);
26556       case ISD::SETOGT:
26557       case ISD::SETGT:
26558       case ISD::SETGE:
26559         Opcode = X86ISD::FMIN;
26560         break;
26561
26562       case ISD::SETULT:
26563         // Converting this to a max would handle NaNs incorrectly.
26564         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26565           break;
26566         Opcode = X86ISD::FMAX;
26567         break;
26568       case ISD::SETOLE:
26569         // Converting this to a max would handle comparisons between positive
26570         // and negative zero incorrectly, and swapping the operands would
26571         // cause it to handle NaNs incorrectly.
26572         if (!DAG.getTarget().Options.UnsafeFPMath &&
26573             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
26574           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26575             break;
26576           std::swap(LHS, RHS);
26577         }
26578         Opcode = X86ISD::FMAX;
26579         break;
26580       case ISD::SETULE:
26581         // Converting this to a max would handle both negative zeros and NaNs
26582         // incorrectly, but we can swap the operands to fix both.
26583         std::swap(LHS, RHS);
26584       case ISD::SETOLT:
26585       case ISD::SETLT:
26586       case ISD::SETLE:
26587         Opcode = X86ISD::FMAX;
26588         break;
26589       }
26590     }
26591
26592     if (Opcode)
26593       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
26594   }
26595
26596   EVT CondVT = Cond.getValueType();
26597   if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
26598       CondVT.getVectorElementType() == MVT::i1) {
26599     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
26600     // lowering on KNL. In this case we convert it to
26601     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
26602     // The same situation for all 128 and 256-bit vectors of i8 and i16.
26603     // Since SKX these selects have a proper lowering.
26604     EVT OpVT = LHS.getValueType();
26605     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
26606         (OpVT.getVectorElementType() == MVT::i8 ||
26607          OpVT.getVectorElementType() == MVT::i16) &&
26608         !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
26609       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
26610       DCI.AddToWorklist(Cond.getNode());
26611       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
26612     }
26613   }
26614   // If this is a select between two integer constants, try to do some
26615   // optimizations.
26616   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
26617     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
26618       // Don't do this for crazy integer types.
26619       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
26620         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
26621         // so that TrueC (the true value) is larger than FalseC.
26622         bool NeedsCondInvert = false;
26623
26624         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
26625             // Efficiently invertible.
26626             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
26627              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
26628               isa<ConstantSDNode>(Cond.getOperand(1))))) {
26629           NeedsCondInvert = true;
26630           std::swap(TrueC, FalseC);
26631         }
26632
26633         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
26634         if (FalseC->getAPIntValue() == 0 &&
26635             TrueC->getAPIntValue().isPowerOf2()) {
26636           if (NeedsCondInvert) // Invert the condition if needed.
26637             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26638                                DAG.getConstant(1, DL, Cond.getValueType()));
26639
26640           // Zero extend the condition if needed.
26641           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
26642
26643           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
26644           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
26645                              DAG.getConstant(ShAmt, DL, MVT::i8));
26646         }
26647
26648         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
26649         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
26650           if (NeedsCondInvert) // Invert the condition if needed.
26651             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26652                                DAG.getConstant(1, DL, Cond.getValueType()));
26653
26654           // Zero extend the condition if needed.
26655           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
26656                              FalseC->getValueType(0), Cond);
26657           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26658                              SDValue(FalseC, 0));
26659         }
26660
26661         // Optimize cases that will turn into an LEA instruction.  This requires
26662         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
26663         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
26664           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
26665           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
26666
26667           bool isFastMultiplier = false;
26668           if (Diff < 10) {
26669             switch ((unsigned char)Diff) {
26670               default: break;
26671               case 1:  // result = add base, cond
26672               case 2:  // result = lea base(    , cond*2)
26673               case 3:  // result = lea base(cond, cond*2)
26674               case 4:  // result = lea base(    , cond*4)
26675               case 5:  // result = lea base(cond, cond*4)
26676               case 8:  // result = lea base(    , cond*8)
26677               case 9:  // result = lea base(cond, cond*8)
26678                 isFastMultiplier = true;
26679                 break;
26680             }
26681           }
26682
26683           if (isFastMultiplier) {
26684             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
26685             if (NeedsCondInvert) // Invert the condition if needed.
26686               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26687                                  DAG.getConstant(1, DL, Cond.getValueType()));
26688
26689             // Zero extend the condition if needed.
26690             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
26691                                Cond);
26692             // Scale the condition by the difference.
26693             if (Diff != 1)
26694               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
26695                                  DAG.getConstant(Diff, DL,
26696                                                  Cond.getValueType()));
26697
26698             // Add the base if non-zero.
26699             if (FalseC->getAPIntValue() != 0)
26700               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26701                                  SDValue(FalseC, 0));
26702             return Cond;
26703           }
26704         }
26705       }
26706   }
26707
26708   // Canonicalize max and min:
26709   // (x > y) ? x : y -> (x >= y) ? x : y
26710   // (x < y) ? x : y -> (x <= y) ? x : y
26711   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
26712   // the need for an extra compare
26713   // against zero. e.g.
26714   // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
26715   // subl   %esi, %edi
26716   // testl  %edi, %edi
26717   // movl   $0, %eax
26718   // cmovgl %edi, %eax
26719   // =>
26720   // xorl   %eax, %eax
26721   // subl   %esi, $edi
26722   // cmovsl %eax, %edi
26723   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
26724       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26725       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26726     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26727     switch (CC) {
26728     default: break;
26729     case ISD::SETLT:
26730     case ISD::SETGT: {
26731       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
26732       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
26733                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
26734       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
26735     }
26736     }
26737   }
26738
26739   // Early exit check
26740   if (!TLI.isTypeLegal(VT))
26741     return SDValue();
26742
26743   // Match VSELECTs into subs with unsigned saturation.
26744   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
26745       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
26746       ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
26747        (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
26748     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26749
26750     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
26751     // left side invert the predicate to simplify logic below.
26752     SDValue Other;
26753     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
26754       Other = RHS;
26755       CC = ISD::getSetCCInverse(CC, true);
26756     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
26757       Other = LHS;
26758     }
26759
26760     if (Other.getNode() && Other->getNumOperands() == 2 &&
26761         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
26762       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
26763       SDValue CondRHS = Cond->getOperand(1);
26764
26765       // Look for a general sub with unsigned saturation first.
26766       // x >= y ? x-y : 0 --> subus x, y
26767       // x >  y ? x-y : 0 --> subus x, y
26768       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
26769           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
26770         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
26771
26772       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
26773         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
26774           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
26775             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
26776               // If the RHS is a constant we have to reverse the const
26777               // canonicalization.
26778               // x > C-1 ? x+-C : 0 --> subus x, C
26779               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
26780                   CondRHSConst->getAPIntValue() ==
26781                       (-OpRHSConst->getAPIntValue() - 1))
26782                 return DAG.getNode(
26783                     X86ISD::SUBUS, DL, VT, OpLHS,
26784                     DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
26785
26786           // Another special case: If C was a sign bit, the sub has been
26787           // canonicalized into a xor.
26788           // FIXME: Would it be better to use computeKnownBits to determine
26789           //        whether it's safe to decanonicalize the xor?
26790           // x s< 0 ? x^C : 0 --> subus x, C
26791           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
26792               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
26793               OpRHSConst->getAPIntValue().isSignBit())
26794             // Note that we have to rebuild the RHS constant here to ensure we
26795             // don't rely on particular values of undef lanes.
26796             return DAG.getNode(
26797                 X86ISD::SUBUS, DL, VT, OpLHS,
26798                 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
26799         }
26800     }
26801   }
26802
26803   // Simplify vector selection if condition value type matches vselect
26804   // operand type
26805   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
26806     assert(Cond.getValueType().isVector() &&
26807            "vector select expects a vector selector!");
26808
26809     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
26810     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
26811
26812     // Try invert the condition if true value is not all 1s and false value
26813     // is not all 0s.
26814     if (!TValIsAllOnes && !FValIsAllZeros &&
26815         // Check if the selector will be produced by CMPP*/PCMP*
26816         Cond.getOpcode() == ISD::SETCC &&
26817         // Check if SETCC has already been promoted
26818         TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
26819             CondVT) {
26820       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
26821       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
26822
26823       if (TValIsAllZeros || FValIsAllOnes) {
26824         SDValue CC = Cond.getOperand(2);
26825         ISD::CondCode NewCC =
26826           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
26827                                Cond.getOperand(0).getValueType().isInteger());
26828         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
26829         std::swap(LHS, RHS);
26830         TValIsAllOnes = FValIsAllOnes;
26831         FValIsAllZeros = TValIsAllZeros;
26832       }
26833     }
26834
26835     if (TValIsAllOnes || FValIsAllZeros) {
26836       SDValue Ret;
26837
26838       if (TValIsAllOnes && FValIsAllZeros)
26839         Ret = Cond;
26840       else if (TValIsAllOnes)
26841         Ret =
26842             DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
26843       else if (FValIsAllZeros)
26844         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
26845                           DAG.getBitcast(CondVT, LHS));
26846
26847       return DAG.getBitcast(VT, Ret);
26848     }
26849   }
26850
26851   // If this is a *dynamic* select (non-constant condition) and we can match
26852   // this node with one of the variable blend instructions, restructure the
26853   // condition so that the blends can use the high bit of each element and use
26854   // SimplifyDemandedBits to simplify the condition operand.
26855   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
26856       !DCI.isBeforeLegalize() &&
26857       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
26858     unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
26859
26860     // Don't optimize vector selects that map to mask-registers.
26861     if (BitWidth == 1)
26862       return SDValue();
26863
26864     // We can only handle the cases where VSELECT is directly legal on the
26865     // subtarget. We custom lower VSELECT nodes with constant conditions and
26866     // this makes it hard to see whether a dynamic VSELECT will correctly
26867     // lower, so we both check the operation's status and explicitly handle the
26868     // cases where a *dynamic* blend will fail even though a constant-condition
26869     // blend could be custom lowered.
26870     // FIXME: We should find a better way to handle this class of problems.
26871     // Potentially, we should combine constant-condition vselect nodes
26872     // pre-legalization into shuffles and not mark as many types as custom
26873     // lowered.
26874     if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
26875       return SDValue();
26876     // FIXME: We don't support i16-element blends currently. We could and
26877     // should support them by making *all* the bits in the condition be set
26878     // rather than just the high bit and using an i8-element blend.
26879     if (VT.getVectorElementType() == MVT::i16)
26880       return SDValue();
26881     // Dynamic blending was only available from SSE4.1 onward.
26882     if (VT.is128BitVector() && !Subtarget.hasSSE41())
26883       return SDValue();
26884     // Byte blends are only available in AVX2
26885     if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
26886       return SDValue();
26887
26888     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
26889     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
26890
26891     APInt KnownZero, KnownOne;
26892     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
26893                                           DCI.isBeforeLegalizeOps());
26894     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
26895         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
26896                                  TLO)) {
26897       // If we changed the computation somewhere in the DAG, this change
26898       // will affect all users of Cond.
26899       // Make sure it is fine and update all the nodes so that we do not
26900       // use the generic VSELECT anymore. Otherwise, we may perform
26901       // wrong optimizations as we messed up with the actual expectation
26902       // for the vector boolean values.
26903       if (Cond != TLO.Old) {
26904         // Check all uses of that condition operand to check whether it will be
26905         // consumed by non-BLEND instructions, which may depend on all bits are
26906         // set properly.
26907         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26908              I != E; ++I)
26909           if (I->getOpcode() != ISD::VSELECT)
26910             // TODO: Add other opcodes eventually lowered into BLEND.
26911             return SDValue();
26912
26913         // Update all the users of the condition, before committing the change,
26914         // so that the VSELECT optimizations that expect the correct vector
26915         // boolean value will not be triggered.
26916         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26917              I != E; ++I)
26918           DAG.ReplaceAllUsesOfValueWith(
26919               SDValue(*I, 0),
26920               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
26921                           Cond, I->getOperand(1), I->getOperand(2)));
26922         DCI.CommitTargetLoweringOpt(TLO);
26923         return SDValue();
26924       }
26925       // At this point, only Cond is changed. Change the condition
26926       // just for N to keep the opportunity to optimize all other
26927       // users their own way.
26928       DAG.ReplaceAllUsesOfValueWith(
26929           SDValue(N, 0),
26930           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
26931                       TLO.New, N->getOperand(1), N->getOperand(2)));
26932       return SDValue();
26933     }
26934   }
26935
26936   return SDValue();
26937 }
26938
26939 /// Combine:
26940 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
26941 /// to:
26942 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
26943 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
26944 /// Note that this is only legal for some op/cc combinations.
26945 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
26946                                        SelectionDAG &DAG) {
26947   // This combine only operates on CMP-like nodes.
26948   if (!(Cmp.getOpcode() == X86ISD::CMP ||
26949         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
26950     return SDValue();
26951
26952   // This only applies to variations of the common case:
26953   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
26954   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
26955   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
26956   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
26957   // Using the proper condcodes (see below), overflow is checked for.
26958
26959   // FIXME: We can generalize both constraints:
26960   // - XOR/OR/AND (if they were made to survive AtomicExpand)
26961   // - LHS != 1
26962   // if the result is compared.
26963
26964   SDValue CmpLHS = Cmp.getOperand(0);
26965   SDValue CmpRHS = Cmp.getOperand(1);
26966
26967   if (!CmpLHS.hasOneUse())
26968     return SDValue();
26969
26970   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
26971   if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
26972     return SDValue();
26973
26974   const unsigned Opc = CmpLHS.getOpcode();
26975
26976   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
26977     return SDValue();
26978
26979   SDValue OpRHS = CmpLHS.getOperand(2);
26980   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
26981   if (!OpRHSC)
26982     return SDValue();
26983
26984   APInt Addend = OpRHSC->getAPIntValue();
26985   if (Opc == ISD::ATOMIC_LOAD_SUB)
26986     Addend = -Addend;
26987
26988   if (CC == X86::COND_S && Addend == 1)
26989     CC = X86::COND_LE;
26990   else if (CC == X86::COND_NS && Addend == 1)
26991     CC = X86::COND_G;
26992   else if (CC == X86::COND_G && Addend == -1)
26993     CC = X86::COND_GE;
26994   else if (CC == X86::COND_LE && Addend == -1)
26995     CC = X86::COND_L;
26996   else
26997     return SDValue();
26998
26999   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
27000   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
27001                                 DAG.getUNDEF(CmpLHS.getValueType()));
27002   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
27003   return LockOp;
27004 }
27005
27006 // Check whether a boolean test is testing a boolean value generated by
27007 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
27008 // code.
27009 //
27010 // Simplify the following patterns:
27011 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
27012 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
27013 // to (Op EFLAGS Cond)
27014 //
27015 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
27016 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
27017 // to (Op EFLAGS !Cond)
27018 //
27019 // where Op could be BRCOND or CMOV.
27020 //
27021 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
27022   // This combine only operates on CMP-like nodes.
27023   if (!(Cmp.getOpcode() == X86ISD::CMP ||
27024         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
27025     return SDValue();
27026
27027   // Quit if not used as a boolean value.
27028   if (CC != X86::COND_E && CC != X86::COND_NE)
27029     return SDValue();
27030
27031   // Check CMP operands. One of them should be 0 or 1 and the other should be
27032   // an SetCC or extended from it.
27033   SDValue Op1 = Cmp.getOperand(0);
27034   SDValue Op2 = Cmp.getOperand(1);
27035
27036   SDValue SetCC;
27037   const ConstantSDNode* C = nullptr;
27038   bool needOppositeCond = (CC == X86::COND_E);
27039   bool checkAgainstTrue = false; // Is it a comparison against 1?
27040
27041   if ((C = dyn_cast<ConstantSDNode>(Op1)))
27042     SetCC = Op2;
27043   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
27044     SetCC = Op1;
27045   else // Quit if all operands are not constants.
27046     return SDValue();
27047
27048   if (C->getZExtValue() == 1) {
27049     needOppositeCond = !needOppositeCond;
27050     checkAgainstTrue = true;
27051   } else if (C->getZExtValue() != 0)
27052     // Quit if the constant is neither 0 or 1.
27053     return SDValue();
27054
27055   bool truncatedToBoolWithAnd = false;
27056   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
27057   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
27058          SetCC.getOpcode() == ISD::TRUNCATE ||
27059          SetCC.getOpcode() == ISD::AssertZext ||
27060          SetCC.getOpcode() == ISD::AND) {
27061     if (SetCC.getOpcode() == ISD::AND) {
27062       int OpIdx = -1;
27063       if (isOneConstant(SetCC.getOperand(0)))
27064         OpIdx = 1;
27065       if (isOneConstant(SetCC.getOperand(1)))
27066         OpIdx = 0;
27067       if (OpIdx < 0)
27068         break;
27069       SetCC = SetCC.getOperand(OpIdx);
27070       truncatedToBoolWithAnd = true;
27071     } else
27072       SetCC = SetCC.getOperand(0);
27073   }
27074
27075   switch (SetCC.getOpcode()) {
27076   case X86ISD::SETCC_CARRY:
27077     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
27078     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
27079     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
27080     // truncated to i1 using 'and'.
27081     if (checkAgainstTrue && !truncatedToBoolWithAnd)
27082       break;
27083     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
27084            "Invalid use of SETCC_CARRY!");
27085     // FALL THROUGH
27086   case X86ISD::SETCC:
27087     // Set the condition code or opposite one if necessary.
27088     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
27089     if (needOppositeCond)
27090       CC = X86::GetOppositeBranchCondition(CC);
27091     return SetCC.getOperand(1);
27092   case X86ISD::CMOV: {
27093     // Check whether false/true value has canonical one, i.e. 0 or 1.
27094     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
27095     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
27096     // Quit if true value is not a constant.
27097     if (!TVal)
27098       return SDValue();
27099     // Quit if false value is not a constant.
27100     if (!FVal) {
27101       SDValue Op = SetCC.getOperand(0);
27102       // Skip 'zext' or 'trunc' node.
27103       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
27104           Op.getOpcode() == ISD::TRUNCATE)
27105         Op = Op.getOperand(0);
27106       // A special case for rdrand/rdseed, where 0 is set if false cond is
27107       // found.
27108       if ((Op.getOpcode() != X86ISD::RDRAND &&
27109            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
27110         return SDValue();
27111     }
27112     // Quit if false value is not the constant 0 or 1.
27113     bool FValIsFalse = true;
27114     if (FVal && FVal->getZExtValue() != 0) {
27115       if (FVal->getZExtValue() != 1)
27116         return SDValue();
27117       // If FVal is 1, opposite cond is needed.
27118       needOppositeCond = !needOppositeCond;
27119       FValIsFalse = false;
27120     }
27121     // Quit if TVal is not the constant opposite of FVal.
27122     if (FValIsFalse && TVal->getZExtValue() != 1)
27123       return SDValue();
27124     if (!FValIsFalse && TVal->getZExtValue() != 0)
27125       return SDValue();
27126     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
27127     if (needOppositeCond)
27128       CC = X86::GetOppositeBranchCondition(CC);
27129     return SetCC.getOperand(3);
27130   }
27131   }
27132
27133   return SDValue();
27134 }
27135
27136 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
27137 /// Match:
27138 ///   (X86or (X86setcc) (X86setcc))
27139 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
27140 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
27141                                            X86::CondCode &CC1, SDValue &Flags,
27142                                            bool &isAnd) {
27143   if (Cond->getOpcode() == X86ISD::CMP) {
27144     if (!isNullConstant(Cond->getOperand(1)))
27145       return false;
27146
27147     Cond = Cond->getOperand(0);
27148   }
27149
27150   isAnd = false;
27151
27152   SDValue SetCC0, SetCC1;
27153   switch (Cond->getOpcode()) {
27154   default: return false;
27155   case ISD::AND:
27156   case X86ISD::AND:
27157     isAnd = true;
27158     // fallthru
27159   case ISD::OR:
27160   case X86ISD::OR:
27161     SetCC0 = Cond->getOperand(0);
27162     SetCC1 = Cond->getOperand(1);
27163     break;
27164   };
27165
27166   // Make sure we have SETCC nodes, using the same flags value.
27167   if (SetCC0.getOpcode() != X86ISD::SETCC ||
27168       SetCC1.getOpcode() != X86ISD::SETCC ||
27169       SetCC0->getOperand(1) != SetCC1->getOperand(1))
27170     return false;
27171
27172   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
27173   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
27174   Flags = SetCC0->getOperand(1);
27175   return true;
27176 }
27177
27178 /// Optimize an EFLAGS definition used according to the condition code \p CC
27179 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
27180 /// uses of chain values.
27181 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
27182                                   SelectionDAG &DAG) {
27183   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
27184     return R;
27185   return combineSetCCAtomicArith(EFLAGS, CC, DAG);
27186 }
27187
27188 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
27189 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
27190                            TargetLowering::DAGCombinerInfo &DCI,
27191                            const X86Subtarget &Subtarget) {
27192   SDLoc DL(N);
27193
27194   // If the flag operand isn't dead, don't touch this CMOV.
27195   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
27196     return SDValue();
27197
27198   SDValue FalseOp = N->getOperand(0);
27199   SDValue TrueOp = N->getOperand(1);
27200   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
27201   SDValue Cond = N->getOperand(3);
27202
27203   if (CC == X86::COND_E || CC == X86::COND_NE) {
27204     switch (Cond.getOpcode()) {
27205     default: break;
27206     case X86ISD::BSR:
27207     case X86ISD::BSF:
27208       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
27209       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
27210         return (CC == X86::COND_E) ? FalseOp : TrueOp;
27211     }
27212   }
27213
27214   // Try to simplify the EFLAGS and condition code operands.
27215   // We can't always do this as FCMOV only supports a subset of X86 cond.
27216   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
27217     if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
27218       SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
27219         Flags};
27220       return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27221     }
27222   }
27223
27224   // If this is a select between two integer constants, try to do some
27225   // optimizations.  Note that the operands are ordered the opposite of SELECT
27226   // operands.
27227   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
27228     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
27229       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
27230       // larger than FalseC (the false value).
27231       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
27232         CC = X86::GetOppositeBranchCondition(CC);
27233         std::swap(TrueC, FalseC);
27234         std::swap(TrueOp, FalseOp);
27235       }
27236
27237       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
27238       // This is efficient for any integer data type (including i8/i16) and
27239       // shift amount.
27240       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
27241         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27242                            DAG.getConstant(CC, DL, MVT::i8), Cond);
27243
27244         // Zero extend the condition if needed.
27245         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
27246
27247         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
27248         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
27249                            DAG.getConstant(ShAmt, DL, MVT::i8));
27250         if (N->getNumValues() == 2)  // Dead flag value?
27251           return DCI.CombineTo(N, Cond, SDValue());
27252         return Cond;
27253       }
27254
27255       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
27256       // for any integer data type, including i8/i16.
27257       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
27258         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27259                            DAG.getConstant(CC, DL, MVT::i8), Cond);
27260
27261         // Zero extend the condition if needed.
27262         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
27263                            FalseC->getValueType(0), Cond);
27264         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27265                            SDValue(FalseC, 0));
27266
27267         if (N->getNumValues() == 2)  // Dead flag value?
27268           return DCI.CombineTo(N, Cond, SDValue());
27269         return Cond;
27270       }
27271
27272       // Optimize cases that will turn into an LEA instruction.  This requires
27273       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
27274       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
27275         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
27276         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
27277
27278         bool isFastMultiplier = false;
27279         if (Diff < 10) {
27280           switch ((unsigned char)Diff) {
27281           default: break;
27282           case 1:  // result = add base, cond
27283           case 2:  // result = lea base(    , cond*2)
27284           case 3:  // result = lea base(cond, cond*2)
27285           case 4:  // result = lea base(    , cond*4)
27286           case 5:  // result = lea base(cond, cond*4)
27287           case 8:  // result = lea base(    , cond*8)
27288           case 9:  // result = lea base(cond, cond*8)
27289             isFastMultiplier = true;
27290             break;
27291           }
27292         }
27293
27294         if (isFastMultiplier) {
27295           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
27296           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27297                              DAG.getConstant(CC, DL, MVT::i8), Cond);
27298           // Zero extend the condition if needed.
27299           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
27300                              Cond);
27301           // Scale the condition by the difference.
27302           if (Diff != 1)
27303             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
27304                                DAG.getConstant(Diff, DL, Cond.getValueType()));
27305
27306           // Add the base if non-zero.
27307           if (FalseC->getAPIntValue() != 0)
27308             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27309                                SDValue(FalseC, 0));
27310           if (N->getNumValues() == 2)  // Dead flag value?
27311             return DCI.CombineTo(N, Cond, SDValue());
27312           return Cond;
27313         }
27314       }
27315     }
27316   }
27317
27318   // Handle these cases:
27319   //   (select (x != c), e, c) -> select (x != c), e, x),
27320   //   (select (x == c), c, e) -> select (x == c), x, e)
27321   // where the c is an integer constant, and the "select" is the combination
27322   // of CMOV and CMP.
27323   //
27324   // The rationale for this change is that the conditional-move from a constant
27325   // needs two instructions, however, conditional-move from a register needs
27326   // only one instruction.
27327   //
27328   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
27329   //  some instruction-combining opportunities. This opt needs to be
27330   //  postponed as late as possible.
27331   //
27332   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
27333     // the DCI.xxxx conditions are provided to postpone the optimization as
27334     // late as possible.
27335
27336     ConstantSDNode *CmpAgainst = nullptr;
27337     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
27338         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
27339         !isa<ConstantSDNode>(Cond.getOperand(0))) {
27340
27341       if (CC == X86::COND_NE &&
27342           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
27343         CC = X86::GetOppositeBranchCondition(CC);
27344         std::swap(TrueOp, FalseOp);
27345       }
27346
27347       if (CC == X86::COND_E &&
27348           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
27349         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
27350                           DAG.getConstant(CC, DL, MVT::i8), Cond };
27351         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
27352       }
27353     }
27354   }
27355
27356   // Fold and/or of setcc's to double CMOV:
27357   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
27358   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
27359   //
27360   // This combine lets us generate:
27361   //   cmovcc1 (jcc1 if we don't have CMOV)
27362   //   cmovcc2 (same)
27363   // instead of:
27364   //   setcc1
27365   //   setcc2
27366   //   and/or
27367   //   cmovne (jne if we don't have CMOV)
27368   // When we can't use the CMOV instruction, it might increase branch
27369   // mispredicts.
27370   // When we can use CMOV, or when there is no mispredict, this improves
27371   // throughput and reduces register pressure.
27372   //
27373   if (CC == X86::COND_NE) {
27374     SDValue Flags;
27375     X86::CondCode CC0, CC1;
27376     bool isAndSetCC;
27377     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
27378       if (isAndSetCC) {
27379         std::swap(FalseOp, TrueOp);
27380         CC0 = X86::GetOppositeBranchCondition(CC0);
27381         CC1 = X86::GetOppositeBranchCondition(CC1);
27382       }
27383
27384       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
27385         Flags};
27386       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
27387       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
27388       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27389       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
27390       return CMOV;
27391     }
27392   }
27393
27394   return SDValue();
27395 }
27396
/// Ways in which a vector multiply with 32-bit elements can be narrowed.
enum ShrinkMode {
  MULS8,  // Operands fit in a signed 8-bit range.
  MULU8,  // Operands fit in an unsigned 8-bit range.
  MULS16, // Operands fit in a signed 16-bit range.
  MULU16  // Operands fit in an unsigned 16-bit range.
};
27399
27400 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
27401   EVT VT = N->getOperand(0).getValueType();
27402   if (VT.getScalarSizeInBits() != 32)
27403     return false;
27404
27405   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
27406   unsigned SignBits[2] = {1, 1};
27407   bool IsPositive[2] = {false, false};
27408   for (unsigned i = 0; i < 2; i++) {
27409     SDValue Opd = N->getOperand(i);
27410
27411     // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
27412     // compute signbits for it separately.
27413     if (Opd.getOpcode() == ISD::ANY_EXTEND) {
27414       // For anyextend, it is safe to assume an appropriate number of leading
27415       // sign/zero bits.
27416       if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
27417         SignBits[i] = 25;
27418       else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
27419                MVT::i16)
27420         SignBits[i] = 17;
27421       else
27422         return false;
27423       IsPositive[i] = true;
27424     } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
27425       // All the operands of BUILD_VECTOR need to be int constant.
27426       // Find the smallest value range which all the operands belong to.
27427       SignBits[i] = 32;
27428       IsPositive[i] = true;
27429       for (const SDValue &SubOp : Opd.getNode()->op_values()) {
27430         if (SubOp.isUndef())
27431           continue;
27432         auto *CN = dyn_cast<ConstantSDNode>(SubOp);
27433         if (!CN)
27434           return false;
27435         APInt IntVal = CN->getAPIntValue();
27436         if (IntVal.isNegative())
27437           IsPositive[i] = false;
27438         SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
27439       }
27440     } else {
27441       SignBits[i] = DAG.ComputeNumSignBits(Opd);
27442       if (Opd.getOpcode() == ISD::ZERO_EXTEND)
27443         IsPositive[i] = true;
27444     }
27445   }
27446
27447   bool AllPositive = IsPositive[0] && IsPositive[1];
27448   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
27449   // When ranges are from -128 ~ 127, use MULS8 mode.
27450   if (MinSignBits >= 25)
27451     Mode = MULS8;
27452   // When ranges are from 0 ~ 255, use MULU8 mode.
27453   else if (AllPositive && MinSignBits >= 24)
27454     Mode = MULU8;
27455   // When ranges are from -32768 ~ 32767, use MULS16 mode.
27456   else if (MinSignBits >= 17)
27457     Mode = MULS16;
27458   // When ranges are from 0 ~ 65535, use MULU16 mode.
27459   else if (AllPositive && MinSignBits >= 16)
27460     Mode = MULU16;
27461   else
27462     return false;
27463   return true;
27464 }
27465
27466 /// When the operands of vector mul are extended from smaller size values,
27467 /// like i8 and i16, the type of mul may be shrinked to generate more
27468 /// efficient code. Two typical patterns are handled:
27469 /// Pattern1:
27470 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
27471 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
27472 //   or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27473 ///     %5 = mul <N x i32> %2, %4
27474 ///
27475 /// Pattern2:
27476 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
27477 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
27478 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27479 ///     %5 = mul <N x i32> %2, %4
27480 ///
27481 /// There are four mul shrinking modes:
27482 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
27483 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
27484 /// generate pmullw+sext32 for it (MULS8 mode).
27485 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
27486 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
27487 /// generate pmullw+zext32 for it (MULU8 mode).
27488 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
27489 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
27490 /// generate pmullw+pmulhw for it (MULS16 mode).
27491 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
27492 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
27493 /// generate pmullw+pmulhuw for it (MULU16 mode).
27494 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
27495                                const X86Subtarget &Subtarget) {
27496   // pmulld is supported since SSE41. It is better to use pmulld
27497   // instead of pmullw+pmulhw.
27498   if (Subtarget.hasSSE41())
27499     return SDValue();
27500
27501   ShrinkMode Mode;
27502   if (!canReduceVMulWidth(N, DAG, Mode))
27503     return SDValue();
27504
27505   SDLoc DL(N);
27506   SDValue N0 = N->getOperand(0);
27507   SDValue N1 = N->getOperand(1);
27508   EVT VT = N->getOperand(0).getValueType();
27509   unsigned RegSize = 128;
27510   MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
27511   EVT ReducedVT =
27512       EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
27513   // Shrink the operands of mul.
27514   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
27515   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
27516
27517   if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
27518     // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
27519     // lower part is needed.
27520     SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
27521     if (Mode == MULU8 || Mode == MULS8) {
27522       return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
27523                          DL, VT, MulLo);
27524     } else {
27525       MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27526       // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
27527       // the higher part is also needed.
27528       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27529                                   ReducedVT, NewN0, NewN1);
27530
27531       // Repack the lower part and higher part result of mul into a wider
27532       // result.
27533       // Generate shuffle functioning as punpcklwd.
27534       SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
27535       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27536         ShuffleMask[2 * i] = i;
27537         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
27538       }
27539       SDValue ResLo =
27540           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27541       ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
27542       // Generate shuffle functioning as punpckhwd.
27543       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27544         ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
27545         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
27546       }
27547       SDValue ResHi =
27548           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27549       ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
27550       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
27551     }
27552   } else {
27553     // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
27554     // to legalize the mul explicitly because implicit legalization for type
27555     // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
27556     // instructions which will not exist when we explicitly legalize it by
27557     // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
27558     // <4 x i16> undef).
27559     //
27560     // Legalize the operands of mul.
27561     SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
27562                                  DAG.getUNDEF(ReducedVT));
27563     Ops[0] = NewN0;
27564     NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27565     Ops[0] = NewN1;
27566     NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27567
27568     if (Mode == MULU8 || Mode == MULS8) {
27569       // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
27570       // part is needed.
27571       SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27572
27573       // convert the type of mul result to VT.
27574       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27575       SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
27576                                               : ISD::SIGN_EXTEND_VECTOR_INREG,
27577                                 DL, ResVT, Mul);
27578       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27579                          DAG.getIntPtrConstant(0, DL));
27580     } else {
27581       // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
27582       // MULU16/MULS16, both parts are needed.
27583       SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27584       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27585                                   OpsVT, NewN0, NewN1);
27586
27587       // Repack the lower part and higher part result of mul into a wider
27588       // result. Make sure the type of mul result is VT.
27589       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27590       SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
27591       Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
27592       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27593                          DAG.getIntPtrConstant(0, DL));
27594     }
27595   }
27596 }
27597
27598 /// Optimize a single multiply with constant into two operations in order to
27599 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
27600 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
27601                           TargetLowering::DAGCombinerInfo &DCI,
27602                           const X86Subtarget &Subtarget) {
27603   EVT VT = N->getValueType(0);
27604   if (DCI.isBeforeLegalize() && VT.isVector())
27605     return reduceVMULWidth(N, DAG, Subtarget);
27606
27607   // An imul is usually smaller than the alternative sequence.
27608   if (DAG.getMachineFunction().getFunction()->optForMinSize())
27609     return SDValue();
27610
27611   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
27612     return SDValue();
27613
27614   if (VT != MVT::i64 && VT != MVT::i32)
27615     return SDValue();
27616
27617   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
27618   if (!C)
27619     return SDValue();
27620   uint64_t MulAmt = C->getZExtValue();
27621   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
27622     return SDValue();
27623
27624   uint64_t MulAmt1 = 0;
27625   uint64_t MulAmt2 = 0;
27626   if ((MulAmt % 9) == 0) {
27627     MulAmt1 = 9;
27628     MulAmt2 = MulAmt / 9;
27629   } else if ((MulAmt % 5) == 0) {
27630     MulAmt1 = 5;
27631     MulAmt2 = MulAmt / 5;
27632   } else if ((MulAmt % 3) == 0) {
27633     MulAmt1 = 3;
27634     MulAmt2 = MulAmt / 3;
27635   }
27636
27637   SDLoc DL(N);
27638   SDValue NewMul;
27639   if (MulAmt2 &&
27640       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
27641
27642     if (isPowerOf2_64(MulAmt2) &&
27643         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
27644       // If second multiplifer is pow2, issue it first. We want the multiply by
27645       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
27646       // is an add.
27647       std::swap(MulAmt1, MulAmt2);
27648
27649     if (isPowerOf2_64(MulAmt1))
27650       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27651                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
27652     else
27653       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
27654                            DAG.getConstant(MulAmt1, DL, VT));
27655
27656     if (isPowerOf2_64(MulAmt2))
27657       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
27658                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
27659     else
27660       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
27661                            DAG.getConstant(MulAmt2, DL, VT));
27662   }
27663
27664   if (!NewMul) {
27665     assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
27666            && "Both cases that could cause potential overflows should have "
27667               "already been handled.");
27668     if (isPowerOf2_64(MulAmt - 1))
27669       // (mul x, 2^N + 1) => (add (shl x, N), x)
27670       NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
27671                                 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27672                                 DAG.getConstant(Log2_64(MulAmt - 1), DL,
27673                                 MVT::i8)));
27674
27675     else if (isPowerOf2_64(MulAmt + 1))
27676       // (mul x, 2^N - 1) => (sub (shl x, N), x)
27677       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
27678                                 N->getOperand(0),
27679                                 DAG.getConstant(Log2_64(MulAmt + 1),
27680                                 DL, MVT::i8)), N->getOperand(0));
27681   }
27682
27683   if (NewMul)
27684     // Do not add new nodes to DAG combiner worklist.
27685     DCI.CombineTo(N, NewMul, false);
27686
27687   return SDValue();
27688 }
27689
27690 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
27691   SDValue N0 = N->getOperand(0);
27692   SDValue N1 = N->getOperand(1);
27693   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
27694   EVT VT = N0.getValueType();
27695
27696   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
27697   // since the result of setcc_c is all zero's or all ones.
27698   if (VT.isInteger() && !VT.isVector() &&
27699       N1C && N0.getOpcode() == ISD::AND &&
27700       N0.getOperand(1).getOpcode() == ISD::Constant) {
27701     SDValue N00 = N0.getOperand(0);
27702     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
27703     const APInt &ShAmt = N1C->getAPIntValue();
27704     Mask = Mask.shl(ShAmt);
27705     bool MaskOK = false;
27706     // We can handle cases concerning bit-widening nodes containing setcc_c if
27707     // we carefully interrogate the mask to make sure we are semantics
27708     // preserving.
27709     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
27710     // of the underlying setcc_c operation if the setcc_c was zero extended.
27711     // Consider the following example:
27712     //   zext(setcc_c)                 -> i32 0x0000FFFF
27713     //   c1                            -> i32 0x0000FFFF
27714     //   c2                            -> i32 0x00000001
27715     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
27716     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
27717     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
27718       MaskOK = true;
27719     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
27720                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27721       MaskOK = true;
27722     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
27723                 N00.getOpcode() == ISD::ANY_EXTEND) &&
27724                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27725       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
27726     }
27727     if (MaskOK && Mask != 0) {
27728       SDLoc DL(N);
27729       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
27730     }
27731   }
27732
27733   // Hardware support for vector shifts is sparse which makes us scalarize the
27734   // vector operations in many cases. Also, on sandybridge ADD is faster than
27735   // shl.
27736   // (shl V, 1) -> add V,V
27737   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
27738     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
27739       assert(N0.getValueType().isVector() && "Invalid vector shift type");
27740       // We shift all of the values by one. In many cases we do not have
27741       // hardware support for this operation. This is better expressed as an ADD
27742       // of two values.
27743       if (N1SplatC->getAPIntValue() == 1)
27744         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
27745     }
27746
27747   return SDValue();
27748 }
27749
27750 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
27751   SDValue N0 = N->getOperand(0);
27752   SDValue N1 = N->getOperand(1);
27753   EVT VT = N0.getValueType();
27754   unsigned Size = VT.getSizeInBits();
27755
27756   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
27757   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
27758   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
27759   // depending on sign of (SarConst - [56,48,32,24,16])
27760
27761   // sexts in X86 are MOVs. The MOVs have the same code size
27762   // as above SHIFTs (only SHIFT on 1 has lower code size).
27763   // However the MOVs have 2 advantages to a SHIFT:
27764   // 1. MOVs can write to a register that differs from source
27765   // 2. MOVs accept memory operands
27766
27767   if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
27768       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
27769       N0.getOperand(1).getOpcode() != ISD::Constant)
27770     return SDValue();
27771
27772   SDValue N00 = N0.getOperand(0);
27773   SDValue N01 = N0.getOperand(1);
27774   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
27775   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
27776   EVT CVT = N1.getValueType();
27777
27778   if (SarConst.isNegative())
27779     return SDValue();
27780
27781   for (MVT SVT : MVT::integer_valuetypes()) {
27782     unsigned ShiftSize = SVT.getSizeInBits();
27783     // skipping types without corresponding sext/zext and
27784     // ShlConst that is not one of [56,48,32,24,16]
27785     if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
27786       continue;
27787     SDLoc DL(N);
27788     SDValue NN =
27789         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
27790     SarConst = SarConst - (Size - ShiftSize);
27791     if (SarConst == 0)
27792       return NN;
27793     else if (SarConst.isNegative())
27794       return DAG.getNode(ISD::SHL, DL, VT, NN,
27795                          DAG.getConstant(-SarConst, DL, CVT));
27796     else
27797       return DAG.getNode(ISD::SRA, DL, VT, NN,
27798                          DAG.getConstant(SarConst, DL, CVT));
27799   }
27800   return SDValue();
27801 }
27802
27803 /// \brief Returns a vector of 0s if the node in input is a vector logical
27804 /// shift by a constant amount which is known to be bigger than or equal
27805 /// to the vector element size in bits.
27806 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
27807                                       const X86Subtarget &Subtarget) {
27808   EVT VT = N->getValueType(0);
27809
27810   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
27811       (!Subtarget.hasInt256() ||
27812        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
27813     return SDValue();
27814
27815   SDValue Amt = N->getOperand(1);
27816   SDLoc DL(N);
27817   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
27818     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
27819       const APInt &ShiftAmt = AmtSplat->getAPIntValue();
27820       unsigned MaxAmount =
27821         VT.getSimpleVT().getVectorElementType().getSizeInBits();
27822
27823       // SSE2/AVX2 logical shifts always return a vector of 0s
27824       // if the shift amount is bigger than or equal to
27825       // the element size. The constant shift amount will be
27826       // encoded as a 8-bit immediate.
27827       if (ShiftAmt.trunc(8).uge(MaxAmount))
27828         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
27829     }
27830
27831   return SDValue();
27832 }
27833
27834 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
27835                             TargetLowering::DAGCombinerInfo &DCI,
27836                             const X86Subtarget &Subtarget) {
27837   if (N->getOpcode() == ISD::SHL)
27838     if (SDValue V = combineShiftLeft(N, DAG))
27839       return V;
27840
27841   if (N->getOpcode() == ISD::SRA)
27842     if (SDValue V = combineShiftRightAlgebraic(N, DAG))
27843       return V;
27844
27845   // Try to fold this logical shift into a zero vector.
27846   if (N->getOpcode() != ISD::SRA)
27847     if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
27848       return V;
27849
27850   return SDValue();
27851 }
27852
27853 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
27854 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
27855 /// OR -> CMPNEQSS.
27856 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
27857                                    TargetLowering::DAGCombinerInfo &DCI,
27858                                    const X86Subtarget &Subtarget) {
27859   unsigned opcode;
27860
27861   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
27862   // we're requiring SSE2 for both.
27863   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
27864     SDValue N0 = N->getOperand(0);
27865     SDValue N1 = N->getOperand(1);
27866     SDValue CMP0 = N0->getOperand(1);
27867     SDValue CMP1 = N1->getOperand(1);
27868     SDLoc DL(N);
27869
27870     // The SETCCs should both refer to the same CMP.
27871     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
27872       return SDValue();
27873
27874     SDValue CMP00 = CMP0->getOperand(0);
27875     SDValue CMP01 = CMP0->getOperand(1);
27876     EVT     VT    = CMP00.getValueType();
27877
27878     if (VT == MVT::f32 || VT == MVT::f64) {
27879       bool ExpectingFlags = false;
27880       // Check for any users that want flags:
27881       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
27882            !ExpectingFlags && UI != UE; ++UI)
27883         switch (UI->getOpcode()) {
27884         default:
27885         case ISD::BR_CC:
27886         case ISD::BRCOND:
27887         case ISD::SELECT:
27888           ExpectingFlags = true;
27889           break;
27890         case ISD::CopyToReg:
27891         case ISD::SIGN_EXTEND:
27892         case ISD::ZERO_EXTEND:
27893         case ISD::ANY_EXTEND:
27894           break;
27895         }
27896
27897       if (!ExpectingFlags) {
27898         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
27899         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
27900
27901         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
27902           X86::CondCode tmp = cc0;
27903           cc0 = cc1;
27904           cc1 = tmp;
27905         }
27906
27907         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
27908             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
27909           // FIXME: need symbolic constants for these magic numbers.
27910           // See X86ATTInstPrinter.cpp:printSSECC().
27911           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
27912           if (Subtarget.hasAVX512()) {
27913             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
27914                                          CMP01,
27915                                          DAG.getConstant(x86cc, DL, MVT::i8));
27916             if (N->getValueType(0) != MVT::i1)
27917               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
27918                                  FSetCC);
27919             return FSetCC;
27920           }
27921           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
27922                                               CMP00.getValueType(), CMP00, CMP01,
27923                                               DAG.getConstant(x86cc, DL,
27924                                                               MVT::i8));
27925
27926           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
27927           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
27928
27929           if (is64BitFP && !Subtarget.is64Bit()) {
27930             // On a 32-bit target, we cannot bitcast the 64-bit float to a
27931             // 64-bit integer, since that's not a legal type. Since
27932             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
27933             // bits, but can do this little dance to extract the lowest 32 bits
27934             // and work with those going forward.
27935             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
27936                                            OnesOrZeroesF);
27937             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
27938             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
27939                                         Vector32, DAG.getIntPtrConstant(0, DL));
27940             IntVT = MVT::i32;
27941           }
27942
27943           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
27944           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
27945                                       DAG.getConstant(1, DL, IntVT));
27946           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27947                                               ANDed);
27948           return OneBitOfTruth;
27949         }
27950       }
27951     }
27952   }
27953   return SDValue();
27954 }
27955
27956 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
27957 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
27958   assert(N->getOpcode() == ISD::AND);
27959
27960   EVT VT = N->getValueType(0);
27961   SDValue N0 = N->getOperand(0);
27962   SDValue N1 = N->getOperand(1);
27963   SDLoc DL(N);
27964
27965   if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
27966       VT != MVT::v8i64 && VT != MVT::v16i32 &&
27967       VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
27968     return SDValue();
27969
27970   // Canonicalize XOR to the left.
27971   if (N1.getOpcode() == ISD::XOR)
27972     std::swap(N0, N1);
27973
27974   if (N0.getOpcode() != ISD::XOR)
27975     return SDValue();
27976
27977   SDValue N00 = N0->getOperand(0);
27978   SDValue N01 = N0->getOperand(1);
27979
27980   N01 = peekThroughBitcasts(N01);
27981
27982   // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
27983   // insert_subvector building a 256-bit AllOnes vector.
27984   if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
27985     if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
27986       return SDValue();
27987
27988     SDValue V1 = N01->getOperand(0);
27989     SDValue V2 = N01->getOperand(1);
27990     if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
27991         !V1.getOperand(0).isUndef() ||
27992         !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
27993         !ISD::isBuildVectorAllOnes(V2.getNode()))
27994       return SDValue();
27995   }
27996   return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
27997 }
27998
27999 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
28000 // register. In most cases we actually compare or select YMM-sized registers
28001 // and mixing the two types creates horrible code. This method optimizes
28002 // some of the transition sequences.
28003 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
28004                                  TargetLowering::DAGCombinerInfo &DCI,
28005                                  const X86Subtarget &Subtarget) {
28006   EVT VT = N->getValueType(0);
28007   if (!VT.is256BitVector())
28008     return SDValue();
28009
28010   assert((N->getOpcode() == ISD::ANY_EXTEND ||
28011           N->getOpcode() == ISD::ZERO_EXTEND ||
28012           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
28013
28014   SDValue Narrow = N->getOperand(0);
28015   EVT NarrowVT = Narrow->getValueType(0);
28016   if (!NarrowVT.is128BitVector())
28017     return SDValue();
28018
28019   if (Narrow->getOpcode() != ISD::XOR &&
28020       Narrow->getOpcode() != ISD::AND &&
28021       Narrow->getOpcode() != ISD::OR)
28022     return SDValue();
28023
28024   SDValue N0  = Narrow->getOperand(0);
28025   SDValue N1  = Narrow->getOperand(1);
28026   SDLoc DL(Narrow);
28027
28028   // The Left side has to be a trunc.
28029   if (N0.getOpcode() != ISD::TRUNCATE)
28030     return SDValue();
28031
28032   // The type of the truncated inputs.
28033   EVT WideVT = N0->getOperand(0)->getValueType(0);
28034   if (WideVT != VT)
28035     return SDValue();
28036
28037   // The right side has to be a 'trunc' or a constant vector.
28038   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
28039   ConstantSDNode *RHSConstSplat = nullptr;
28040   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
28041     RHSConstSplat = RHSBV->getConstantSplatNode();
28042   if (!RHSTrunc && !RHSConstSplat)
28043     return SDValue();
28044
28045   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28046
28047   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
28048     return SDValue();
28049
28050   // Set N0 and N1 to hold the inputs to the new wide operation.
28051   N0 = N0->getOperand(0);
28052   if (RHSConstSplat) {
28053     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
28054                      SDValue(RHSConstSplat, 0));
28055     N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
28056   } else if (RHSTrunc) {
28057     N1 = N1->getOperand(0);
28058   }
28059
28060   // Generate the wide operation.
28061   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
28062   unsigned Opcode = N->getOpcode();
28063   switch (Opcode) {
28064   case ISD::ANY_EXTEND:
28065     return Op;
28066   case ISD::ZERO_EXTEND: {
28067     unsigned InBits = NarrowVT.getScalarSizeInBits();
28068     APInt Mask = APInt::getAllOnesValue(InBits);
28069     Mask = Mask.zext(VT.getScalarSizeInBits());
28070     return DAG.getNode(ISD::AND, DL, VT,
28071                        Op, DAG.getConstant(Mask, DL, VT));
28072   }
28073   case ISD::SIGN_EXTEND:
28074     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
28075                        Op, DAG.getValueType(NarrowVT));
28076   default:
28077     llvm_unreachable("Unexpected opcode");
28078   }
28079 }
28080
28081 static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
28082                                  TargetLowering::DAGCombinerInfo &DCI,
28083                                  const X86Subtarget &Subtarget) {
28084   SDValue N0 = N->getOperand(0);
28085   SDValue N1 = N->getOperand(1);
28086   SDLoc DL(N);
28087
28088   // A vector zext_in_reg may be represented as a shuffle,
28089   // feeding into a bitcast (this represents anyext) feeding into
28090   // an and with a mask.
28091   // We'd like to try to combine that into a shuffle with zero
28092   // plus a bitcast, removing the and.
28093   if (N0.getOpcode() != ISD::BITCAST ||
28094       N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
28095     return SDValue();
28096
28097   // The other side of the AND should be a splat of 2^C, where C
28098   // is the number of bits in the source type.
28099   N1 = peekThroughBitcasts(N1);
28100   if (N1.getOpcode() != ISD::BUILD_VECTOR)
28101     return SDValue();
28102   BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
28103
28104   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
28105   EVT SrcType = Shuffle->getValueType(0);
28106
28107   // We expect a single-source shuffle
28108   if (!Shuffle->getOperand(1)->isUndef())
28109     return SDValue();
28110
28111   unsigned SrcSize = SrcType.getScalarSizeInBits();
28112   unsigned NumElems = SrcType.getVectorNumElements();
28113
28114   APInt SplatValue, SplatUndef;
28115   unsigned SplatBitSize;
28116   bool HasAnyUndefs;
28117   if (!Vector->isConstantSplat(SplatValue, SplatUndef,
28118                                 SplatBitSize, HasAnyUndefs))
28119     return SDValue();
28120
28121   unsigned ResSize = N1.getValueType().getScalarSizeInBits();
28122   // Make sure the splat matches the mask we expect
28123   if (SplatBitSize > ResSize ||
28124       (SplatValue + 1).exactLogBase2() != (int)SrcSize)
28125     return SDValue();
28126
28127   // Make sure the input and output size make sense
28128   if (SrcSize >= ResSize || ResSize % SrcSize)
28129     return SDValue();
28130
28131   // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
28132   // The number of u's between each two values depends on the ratio between
28133   // the source and dest type.
28134   unsigned ZextRatio = ResSize / SrcSize;
28135   bool IsZext = true;
28136   for (unsigned i = 0; i != NumElems; ++i) {
28137     if (i % ZextRatio) {
28138       if (Shuffle->getMaskElt(i) > 0) {
28139         // Expected undef
28140         IsZext = false;
28141         break;
28142       }
28143     } else {
28144       if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
28145         // Expected element number
28146         IsZext = false;
28147         break;
28148       }
28149     }
28150   }
28151
28152   if (!IsZext)
28153     return SDValue();
28154
28155   // Ok, perform the transformation - replace the shuffle with
28156   // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
28157   // (instead of undef) where the k elements come from the zero vector.
28158   SmallVector<int, 8> Mask;
28159   for (unsigned i = 0; i != NumElems; ++i)
28160     if (i % ZextRatio)
28161       Mask.push_back(NumElems);
28162     else
28163       Mask.push_back(i / ZextRatio);
28164
28165   SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
28166     Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
28167   return DAG.getBitcast(N0.getValueType(), NewShuffle);
28168 }
28169
28170 /// If both input operands of a logic op are being cast from floating point
28171 /// types, try to convert this into a floating point logic node to avoid
28172 /// unnecessary moves from SSE to integer registers.
28173 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
28174                                         const X86Subtarget &Subtarget) {
28175   unsigned FPOpcode = ISD::DELETED_NODE;
28176   if (N->getOpcode() == ISD::AND)
28177     FPOpcode = X86ISD::FAND;
28178   else if (N->getOpcode() == ISD::OR)
28179     FPOpcode = X86ISD::FOR;
28180   else if (N->getOpcode() == ISD::XOR)
28181     FPOpcode = X86ISD::FXOR;
28182
28183   assert(FPOpcode != ISD::DELETED_NODE &&
28184          "Unexpected input node for FP logic conversion");
28185
28186   EVT VT = N->getValueType(0);
28187   SDValue N0 = N->getOperand(0);
28188   SDValue N1 = N->getOperand(1);
28189   SDLoc DL(N);
28190   if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
28191       ((Subtarget.hasSSE1() && VT == MVT::i32) ||
28192        (Subtarget.hasSSE2() && VT == MVT::i64))) {
28193     SDValue N00 = N0.getOperand(0);
28194     SDValue N10 = N1.getOperand(0);
28195     EVT N00Type = N00.getValueType();
28196     EVT N10Type = N10.getValueType();
28197     if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
28198       SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
28199       return DAG.getBitcast(VT, FPLogic);
28200     }
28201   }
28202   return SDValue();
28203 }
28204
28205 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
28206 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
28207 /// eliminate loading the vector constant mask value. This relies on the fact
28208 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
28209 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
28210   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
28211   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
28212
28213   // TODO: Use AssertSext to mark any nodes that have the property of producing
28214   // all-ones or all-zeros. Then check for that node rather than particular
28215   // opcodes.
28216   if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
28217     return SDValue();
28218
28219   // The existence of the PCMP node guarantees that we have the required SSE2 or
28220   // AVX2 for a shift of this vector type, but there is no vector shift by
28221   // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
28222   // masked compare nodes, so they should not make it here.
28223   EVT VT0 = Op0.getValueType();
28224   EVT VT1 = Op1.getValueType();
28225   unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
28226   if (VT0 != VT1 || EltBitWidth == 8)
28227     return SDValue();
28228
28229   assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
28230
28231   APInt SplatVal;
28232   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
28233     return SDValue();
28234
28235   SDLoc DL(N);
28236   SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
28237   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
28238   return DAG.getBitcast(N->getValueType(0), Shift);
28239 }
28240
28241 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
28242                           TargetLowering::DAGCombinerInfo &DCI,
28243                           const X86Subtarget &Subtarget) {
28244   if (DCI.isBeforeLegalizeOps())
28245     return SDValue();
28246
28247   if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
28248     return Zext;
28249
28250   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28251     return R;
28252
28253   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28254     return FPLogic;
28255
28256   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
28257     return R;
28258
28259   if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
28260     return ShiftRight;
28261
28262   EVT VT = N->getValueType(0);
28263   SDValue N0 = N->getOperand(0);
28264   SDValue N1 = N->getOperand(1);
28265   SDLoc DL(N);
28266
28267   // Create BEXTR instructions
28268   // BEXTR is ((X >> imm) & (2**size-1))
28269   if (VT != MVT::i32 && VT != MVT::i64)
28270     return SDValue();
28271
28272   if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
28273     return SDValue();
28274   if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
28275     return SDValue();
28276
28277   ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
28278   ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
28279   if (MaskNode && ShiftNode) {
28280     uint64_t Mask = MaskNode->getZExtValue();
28281     uint64_t Shift = ShiftNode->getZExtValue();
28282     if (isMask_64(Mask)) {
28283       uint64_t MaskSize = countPopulation(Mask);
28284       if (Shift + MaskSize <= VT.getSizeInBits())
28285         return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
28286                            DAG.getConstant(Shift | (MaskSize << 8), DL,
28287                                            VT));
28288     }
28289   }
28290   return SDValue();
28291 }
28292
28293 // Try to fold:
28294 //   (or (and (m, y), (pandn m, x)))
28295 // into:
28296 //   (vselect m, x, y)
28297 // As a special case, try to fold:
28298 //   (or (and (m, (sub 0, x)), (pandn m, x)))
28299 // into:
28300 //   (sub (xor X, M), M)
28301 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
28302                                             const X86Subtarget &Subtarget) {
28303   assert(N->getOpcode() == ISD::OR);
28304
28305   SDValue N0 = N->getOperand(0);
28306   SDValue N1 = N->getOperand(1);
28307   EVT VT = N->getValueType(0);
28308
28309   if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
28310     return SDValue();
28311   assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
28312
28313   // Canonicalize pandn to RHS
28314   if (N0.getOpcode() == X86ISD::ANDNP)
28315     std::swap(N0, N1);
28316
28317   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
28318     return SDValue();
28319
28320   SDValue Mask = N1.getOperand(0);
28321   SDValue X = N1.getOperand(1);
28322   SDValue Y;
28323   if (N0.getOperand(0) == Mask)
28324     Y = N0.getOperand(1);
28325   if (N0.getOperand(1) == Mask)
28326     Y = N0.getOperand(0);
28327
28328   // Check to see if the mask appeared in both the AND and ANDNP.
28329   if (!Y.getNode())
28330     return SDValue();
28331
28332   // Validate that X, Y, and Mask are bitcasts, and see through them.
28333   Mask = peekThroughBitcasts(Mask);
28334   X = peekThroughBitcasts(X);
28335   Y = peekThroughBitcasts(Y);
28336
28337   EVT MaskVT = Mask.getValueType();
28338
28339   // Validate that the Mask operand is a vector sra node.
28340   // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
28341   // there is no psrai.b
28342   unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
28343   unsigned SraAmt = ~0;
28344   if (Mask.getOpcode() == ISD::SRA) {
28345     if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
28346       if (auto *AmtConst = AmtBV->getConstantSplatNode())
28347         SraAmt = AmtConst->getZExtValue();
28348   } else if (Mask.getOpcode() == X86ISD::VSRAI) {
28349     SDValue SraC = Mask.getOperand(1);
28350     SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
28351   }
28352   if ((SraAmt + 1) != EltBits)
28353     return SDValue();
28354
28355   SDLoc DL(N);
28356
28357   // Try to match:
28358   //   (or (and (M, (sub 0, X)), (pandn M, X)))
28359   // which is a special case of vselect:
28360   //   (vselect M, (sub 0, X), X)
28361   // Per:
28362   // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
28363   // We know that, if fNegate is 0 or 1:
28364   //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
28365   //
28366   // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
28367   //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
28368   //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
28369   // This lets us transform our vselect to:
28370   //   (add (xor X, M), (and M, 1))
28371   // And further to:
28372   //   (sub (xor X, M), M)
28373   if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
28374     auto IsNegV = [](SDNode *N, SDValue V) {
28375       return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
28376         ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
28377     };
28378     SDValue V;
28379     if (IsNegV(Y.getNode(), X))
28380       V = X;
28381     else if (IsNegV(X.getNode(), Y))
28382       V = Y;
28383
28384     if (V) {
28385       assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
28386       SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
28387       SDValue SubOp2 = Mask;
28388
28389       // If the negate was on the false side of the select, then
28390       // the operands of the SUB need to be swapped. PR 27251.
28391       // This is because the pattern being matched above is
28392       // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
28393       // but if the pattern matched was
28394       // (vselect M, X, (sub (0, X))), that is really negation of the pattern
28395       // above, -(vselect M, (sub 0, X), X), and therefore the replacement
28396       // pattern also needs to be a negation of the replacement pattern above.
28397       // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
28398       // sub accomplishes the negation of the replacement pattern.
28399       if (V == Y)
28400          std::swap(SubOp1, SubOp2);
28401
28402       return DAG.getBitcast(VT,
28403                             DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
28404     }
28405   }
28406
28407   // PBLENDVB is only available on SSE 4.1.
28408   if (!Subtarget.hasSSE41())
28409     return SDValue();
28410
28411   MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
28412
28413   X = DAG.getBitcast(BlendVT, X);
28414   Y = DAG.getBitcast(BlendVT, Y);
28415   Mask = DAG.getBitcast(BlendVT, Mask);
28416   Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
28417   return DAG.getBitcast(VT, Mask);
28418 }
28419
28420 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
28421                          TargetLowering::DAGCombinerInfo &DCI,
28422                          const X86Subtarget &Subtarget) {
28423   if (DCI.isBeforeLegalizeOps())
28424     return SDValue();
28425
28426   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28427     return R;
28428
28429   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28430     return FPLogic;
28431
28432   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
28433     return R;
28434
28435   SDValue N0 = N->getOperand(0);
28436   SDValue N1 = N->getOperand(1);
28437   EVT VT = N->getValueType(0);
28438
28439   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
28440     return SDValue();
28441
28442   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
28443   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
28444
28445   // SHLD/SHRD instructions have lower register pressure, but on some
28446   // platforms they have higher latency than the equivalent
28447   // series of shifts/or that would otherwise be generated.
28448   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
28449   // have higher latencies and we are not optimizing for size.
28450   if (!OptForSize && Subtarget.isSHLDSlow())
28451     return SDValue();
28452
28453   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
28454     std::swap(N0, N1);
28455   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
28456     return SDValue();
28457   if (!N0.hasOneUse() || !N1.hasOneUse())
28458     return SDValue();
28459
28460   SDValue ShAmt0 = N0.getOperand(1);
28461   if (ShAmt0.getValueType() != MVT::i8)
28462     return SDValue();
28463   SDValue ShAmt1 = N1.getOperand(1);
28464   if (ShAmt1.getValueType() != MVT::i8)
28465     return SDValue();
28466   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
28467     ShAmt0 = ShAmt0.getOperand(0);
28468   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
28469     ShAmt1 = ShAmt1.getOperand(0);
28470
28471   SDLoc DL(N);
28472   unsigned Opc = X86ISD::SHLD;
28473   SDValue Op0 = N0.getOperand(0);
28474   SDValue Op1 = N1.getOperand(0);
28475   if (ShAmt0.getOpcode() == ISD::SUB) {
28476     Opc = X86ISD::SHRD;
28477     std::swap(Op0, Op1);
28478     std::swap(ShAmt0, ShAmt1);
28479   }
28480
28481   unsigned Bits = VT.getSizeInBits();
28482   if (ShAmt1.getOpcode() == ISD::SUB) {
28483     SDValue Sum = ShAmt1.getOperand(0);
28484     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
28485       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
28486       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
28487         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
28488       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
28489         return DAG.getNode(Opc, DL, VT,
28490                            Op0, Op1,
28491                            DAG.getNode(ISD::TRUNCATE, DL,
28492                                        MVT::i8, ShAmt0));
28493     }
28494   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
28495     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
28496     if (ShAmt0C &&
28497         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
28498       return DAG.getNode(Opc, DL, VT,
28499                          N0.getOperand(0), N1.getOperand(0),
28500                          DAG.getNode(ISD::TRUNCATE, DL,
28501                                        MVT::i8, ShAmt0));
28502   }
28503
28504   return SDValue();
28505 }
28506
28507 // Generate NEG and CMOV for integer abs.
28508 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
28509   EVT VT = N->getValueType(0);
28510
28511   // Since X86 does not have CMOV for 8-bit integer, we don't convert
28512   // 8-bit integer abs to NEG and CMOV.
28513   if (VT.isInteger() && VT.getSizeInBits() == 8)
28514     return SDValue();
28515
28516   SDValue N0 = N->getOperand(0);
28517   SDValue N1 = N->getOperand(1);
28518   SDLoc DL(N);
28519
28520   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
28521   // and change it to SUB and CMOV.
28522   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
28523       N0.getOpcode() == ISD::ADD &&
28524       N0.getOperand(1) == N1 &&
28525       N1.getOpcode() == ISD::SRA &&
28526       N1.getOperand(0) == N0.getOperand(0))
28527     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
28528       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
28529         // Generate SUB & CMOV.
28530         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28531                                   DAG.getConstant(0, DL, VT), N0.getOperand(0));
28532
28533         SDValue Ops[] = { N0.getOperand(0), Neg,
28534                           DAG.getConstant(X86::COND_GE, DL, MVT::i8),
28535                           SDValue(Neg.getNode(), 1) };
28536         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
28537       }
28538   return SDValue();
28539 }
28540
28541 /// Try to turn tests against the signbit in the form of:
28542 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
28543 /// into:
28544 ///   SETGT(X, -1)
28545 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
28546   // This is only worth doing if the output type is i8 or i1.
28547   EVT ResultType = N->getValueType(0);
28548   if (ResultType != MVT::i8 && ResultType != MVT::i1)
28549     return SDValue();
28550
28551   SDValue N0 = N->getOperand(0);
28552   SDValue N1 = N->getOperand(1);
28553
28554   // We should be performing an xor against a truncated shift.
28555   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
28556     return SDValue();
28557
28558   // Make sure we are performing an xor against one.
28559   if (!isOneConstant(N1))
28560     return SDValue();
28561
28562   // SetCC on x86 zero extends so only act on this if it's a logical shift.
28563   SDValue Shift = N0.getOperand(0);
28564   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
28565     return SDValue();
28566
28567   // Make sure we are truncating from one of i16, i32 or i64.
28568   EVT ShiftTy = Shift.getValueType();
28569   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
28570     return SDValue();
28571
28572   // Make sure the shift amount extracts the sign bit.
28573   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
28574       Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
28575     return SDValue();
28576
28577   // Create a greater-than comparison against -1.
28578   // N.B. Using SETGE against 0 works but we want a canonical looking
28579   // comparison, using SETGT matches up with what TranslateX86CC.
28580   SDLoc DL(N);
28581   SDValue ShiftOp = Shift.getOperand(0);
28582   EVT ShiftOpTy = ShiftOp.getValueType();
28583   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28584   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
28585                                                *DAG.getContext(), ResultType);
28586   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
28587                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
28588   if (SetCCResultType != ResultType)
28589     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
28590   return Cond;
28591 }
28592
28593 /// Turn vector tests of the signbit in the form of:
28594 ///   xor (sra X, elt_size(X)-1), -1
28595 /// into:
28596 ///   pcmpgt X, -1
28597 ///
28598 /// This should be called before type legalization because the pattern may not
28599 /// persist after that.
28600 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
28601                                          const X86Subtarget &Subtarget) {
28602   EVT VT = N->getValueType(0);
28603   if (!VT.isSimple())
28604     return SDValue();
28605
28606   switch (VT.getSimpleVT().SimpleTy) {
28607   default: return SDValue();
28608   case MVT::v16i8:
28609   case MVT::v8i16:
28610   case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
28611   case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
28612   case MVT::v32i8:
28613   case MVT::v16i16:
28614   case MVT::v8i32:
28615   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
28616   }
28617
28618   // There must be a shift right algebraic before the xor, and the xor must be a
28619   // 'not' operation.
28620   SDValue Shift = N->getOperand(0);
28621   SDValue Ones = N->getOperand(1);
28622   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
28623       !ISD::isBuildVectorAllOnes(Ones.getNode()))
28624     return SDValue();
28625
28626   // The shift should be smearing the sign bit across each vector element.
28627   auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
28628   if (!ShiftBV)
28629     return SDValue();
28630
28631   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
28632   auto *ShiftAmt = ShiftBV->getConstantSplatNode();
28633   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
28634     return SDValue();
28635
28636   // Create a greater-than comparison against -1. We don't use the more obvious
28637   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
28638   return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
28639 }
28640
28641 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
28642                                  TargetLowering::DAGCombinerInfo &DCI,
28643                                  const X86Subtarget &Subtarget) {
28644   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
28645     return Cmp;
28646
28647   if (DCI.isBeforeLegalizeOps())
28648     return SDValue();
28649
28650   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
28651     return RV;
28652
28653   if (Subtarget.hasCMov())
28654     if (SDValue RV = combineIntegerAbs(N, DAG))
28655       return RV;
28656
28657   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28658     return FPLogic;
28659
28660   return SDValue();
28661 }
28662
28663 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
28664 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
28665 /// X86ISD::AVG instruction.
28666 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
28667                                 const X86Subtarget &Subtarget,
28668                                 const SDLoc &DL) {
28669   if (!VT.isVector() || !VT.isSimple())
28670     return SDValue();
28671   EVT InVT = In.getValueType();
28672   unsigned NumElems = VT.getVectorNumElements();
28673
28674   EVT ScalarVT = VT.getVectorElementType();
28675   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
28676         isPowerOf2_32(NumElems)))
28677     return SDValue();
28678
28679   // InScalarVT is the intermediate type in AVG pattern and it should be greater
28680   // than the original input type (i8/i16).
28681   EVT InScalarVT = InVT.getVectorElementType();
28682   if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
28683     return SDValue();
28684
28685   if (!Subtarget.hasSSE2())
28686     return SDValue();
28687   if (Subtarget.hasAVX512()) {
28688     if (VT.getSizeInBits() > 512)
28689       return SDValue();
28690   } else if (Subtarget.hasAVX2()) {
28691     if (VT.getSizeInBits() > 256)
28692       return SDValue();
28693   } else {
28694     if (VT.getSizeInBits() > 128)
28695       return SDValue();
28696   }
28697
28698   // Detect the following pattern:
28699   //
28700   //   %1 = zext <N x i8> %a to <N x i32>
28701   //   %2 = zext <N x i8> %b to <N x i32>
28702   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
28703   //   %4 = add nuw nsw <N x i32> %3, %2
28704   //   %5 = lshr <N x i32> %N, <i32 1 x N>
28705   //   %6 = trunc <N x i32> %5 to <N x i8>
28706   //
28707   // In AVX512, the last instruction can also be a trunc store.
28708
28709   if (In.getOpcode() != ISD::SRL)
28710     return SDValue();
28711
28712   // A lambda checking the given SDValue is a constant vector and each element
28713   // is in the range [Min, Max].
28714   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
28715     BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
28716     if (!BV || !BV->isConstant())
28717       return false;
28718     for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
28719       ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
28720       if (!C)
28721         return false;
28722       uint64_t Val = C->getZExtValue();
28723       if (Val < Min || Val > Max)
28724         return false;
28725     }
28726     return true;
28727   };
28728
28729   // Check if each element of the vector is left-shifted by one.
28730   auto LHS = In.getOperand(0);
28731   auto RHS = In.getOperand(1);
28732   if (!IsConstVectorInRange(RHS, 1, 1))
28733     return SDValue();
28734   if (LHS.getOpcode() != ISD::ADD)
28735     return SDValue();
28736
28737   // Detect a pattern of a + b + 1 where the order doesn't matter.
28738   SDValue Operands[3];
28739   Operands[0] = LHS.getOperand(0);
28740   Operands[1] = LHS.getOperand(1);
28741
28742   // Take care of the case when one of the operands is a constant vector whose
28743   // element is in the range [1, 256].
28744   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
28745       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
28746       Operands[0].getOperand(0).getValueType() == VT) {
28747     // The pattern is detected. Subtract one from the constant vector, then
28748     // demote it and emit X86ISD::AVG instruction.
28749     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
28750     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
28751     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
28752     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28753                        Operands[1]);
28754   }
28755
28756   if (Operands[0].getOpcode() == ISD::ADD)
28757     std::swap(Operands[0], Operands[1]);
28758   else if (Operands[1].getOpcode() != ISD::ADD)
28759     return SDValue();
28760   Operands[2] = Operands[1].getOperand(0);
28761   Operands[1] = Operands[1].getOperand(1);
28762
28763   // Now we have three operands of two additions. Check that one of them is a
28764   // constant vector with ones, and the other two are promoted from i8/i16.
28765   for (int i = 0; i < 3; ++i) {
28766     if (!IsConstVectorInRange(Operands[i], 1, 1))
28767       continue;
28768     std::swap(Operands[i], Operands[2]);
28769
28770     // Check if Operands[0] and Operands[1] are results of type promotion.
28771     for (int j = 0; j < 2; ++j)
28772       if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
28773           Operands[j].getOperand(0).getValueType() != VT)
28774         return SDValue();
28775
28776     // The pattern is detected, emit X86ISD::AVG instruction.
28777     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28778                        Operands[1].getOperand(0));
28779   }
28780
28781   return SDValue();
28782 }
28783
28784 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
28785                            TargetLowering::DAGCombinerInfo &DCI,
28786                            const X86Subtarget &Subtarget) {
28787   LoadSDNode *Ld = cast<LoadSDNode>(N);
28788   EVT RegVT = Ld->getValueType(0);
28789   EVT MemVT = Ld->getMemoryVT();
28790   SDLoc dl(Ld);
28791   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28792
28793   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
28794   // into two 16-byte operations.
28795   ISD::LoadExtType Ext = Ld->getExtensionType();
28796   bool Fast;
28797   unsigned AddressSpace = Ld->getAddressSpace();
28798   unsigned Alignment = Ld->getAlignment();
28799   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
28800       Ext == ISD::NON_EXTLOAD &&
28801       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
28802                              AddressSpace, Alignment, &Fast) && !Fast) {
28803     unsigned NumElems = RegVT.getVectorNumElements();
28804     if (NumElems < 2)
28805       return SDValue();
28806
28807     SDValue Ptr = Ld->getBasePtr();
28808
28809     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
28810                                   NumElems/2);
28811     SDValue Load1 =
28812         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
28813                     Alignment, Ld->getMemOperand()->getFlags());
28814
28815     Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
28816     SDValue Load2 =
28817         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
28818                     std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
28819     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
28820                              Load1.getValue(1),
28821                              Load2.getValue(1));
28822
28823     SDValue NewVec = DAG.getUNDEF(RegVT);
28824     NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
28825     NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
28826     return DCI.CombineTo(N, NewVec, TF, true);
28827   }
28828
28829   return SDValue();
28830 }
28831
28832 /// If V is a build vector of boolean constants and exactly one of those
28833 /// constants is true, return the operand index of that true element.
28834 /// Otherwise, return -1.
28835 static int getOneTrueElt(SDValue V) {
28836   // This needs to be a build vector of booleans.
28837   // TODO: Checking for the i1 type matches the IR definition for the mask,
28838   // but the mask check could be loosened to i8 or other types. That might
28839   // also require checking more than 'allOnesValue'; eg, the x86 HW
28840   // instructions only require that the MSB is set for each mask element.
28841   // The ISD::MSTORE comments/definition do not specify how the mask operand
28842   // is formatted.
28843   auto *BV = dyn_cast<BuildVectorSDNode>(V);
28844   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
28845     return -1;
28846
28847   int TrueIndex = -1;
28848   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
28849   for (unsigned i = 0; i < NumElts; ++i) {
28850     const SDValue &Op = BV->getOperand(i);
28851     if (Op.isUndef())
28852       continue;
28853     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
28854     if (!ConstNode)
28855       return -1;
28856     if (ConstNode->getAPIntValue().isAllOnesValue()) {
28857       // If we already found a one, this is too many.
28858       if (TrueIndex >= 0)
28859         return -1;
28860       TrueIndex = i;
28861     }
28862   }
28863   return TrueIndex;
28864 }
28865
28866 /// Given a masked memory load/store operation, return true if it has one mask
28867 /// bit set. If it has one mask bit set, then also return the memory address of
28868 /// the scalar element to load/store, the vector index to insert/extract that
28869 /// scalar element, and the alignment for the scalar memory access.
28870 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
28871                                          SelectionDAG &DAG, SDValue &Addr,
28872                                          SDValue &Index, unsigned &Alignment) {
28873   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
28874   if (TrueMaskElt < 0)
28875     return false;
28876
28877   // Get the address of the one scalar element that is specified by the mask
28878   // using the appropriate offset from the base pointer.
28879   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
28880   Addr = MaskedOp->getBasePtr();
28881   if (TrueMaskElt != 0) {
28882     unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
28883     Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
28884   }
28885
28886   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
28887   Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
28888   return true;
28889 }
28890
28891 /// If exactly one element of the mask is set for a non-extending masked load,
28892 /// it is a scalar load and vector insert.
28893 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
28894 /// mask have already been optimized in IR, so we don't bother with those here.
28895 static SDValue
28896 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28897                              TargetLowering::DAGCombinerInfo &DCI) {
28898   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
28899   // However, some target hooks may need to be added to know when the transform
28900   // is profitable. Endianness would also have to be considered.
28901
28902   SDValue Addr, VecIndex;
28903   unsigned Alignment;
28904   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
28905     return SDValue();
28906
28907   // Load the one scalar element that is specified by the mask using the
28908   // appropriate offset from the base pointer.
28909   SDLoc DL(ML);
28910   EVT VT = ML->getValueType(0);
28911   EVT EltVT = VT.getVectorElementType();
28912   SDValue Load =
28913       DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
28914                   Alignment, ML->getMemOperand()->getFlags());
28915
28916   // Insert the loaded element into the appropriate place in the vector.
28917   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
28918                                Load, VecIndex);
28919   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
28920 }
28921
28922 static SDValue
28923 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28924                               TargetLowering::DAGCombinerInfo &DCI) {
28925   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
28926     return SDValue();
28927
28928   SDLoc DL(ML);
28929   EVT VT = ML->getValueType(0);
28930
28931   // If we are loading the first and last elements of a vector, it is safe and
28932   // always faster to load the whole vector. Replace the masked load with a
28933   // vector load and select.
28934   unsigned NumElts = VT.getVectorNumElements();
28935   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
28936   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
28937   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
28938   if (LoadFirstElt && LoadLastElt) {
28939     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28940                                 ML->getMemOperand());
28941     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
28942     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
28943   }
28944
28945   // Convert a masked load with a constant mask into a masked load and a select.
28946   // This allows the select operation to use a faster kind of select instruction
28947   // (for example, vblendvps -> vblendps).
28948
28949   // Don't try this if the pass-through operand is already undefined. That would
28950   // cause an infinite loop because that's what we're about to create.
28951   if (ML->getSrc0().isUndef())
28952     return SDValue();
28953
28954   // The new masked load has an undef pass-through operand. The select uses the
28955   // original pass-through operand.
28956   SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28957                                     ML->getMask(), DAG.getUNDEF(VT),
28958                                     ML->getMemoryVT(), ML->getMemOperand(),
28959                                     ML->getExtensionType());
28960   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
28961
28962   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
28963 }
28964
28965 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
28966                                  TargetLowering::DAGCombinerInfo &DCI,
28967                                  const X86Subtarget &Subtarget) {
28968   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
28969   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
28970     if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
28971       return ScalarLoad;
28972     // TODO: Do some AVX512 subsets benefit from this transform?
28973     if (!Subtarget.hasAVX512())
28974       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
28975         return Blend;
28976   }
28977
28978   if (Mld->getExtensionType() != ISD::SEXTLOAD)
28979     return SDValue();
28980
28981   // Resolve extending loads.
28982   EVT VT = Mld->getValueType(0);
28983   unsigned NumElems = VT.getVectorNumElements();
28984   EVT LdVT = Mld->getMemoryVT();
28985   SDLoc dl(Mld);
28986
28987   assert(LdVT != VT && "Cannot extend to the same type");
28988   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
28989   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
28990   // From/To sizes and ElemCount must be pow of two.
28991   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
28992     "Unexpected size for extending masked load");
28993
28994   unsigned SizeRatio  = ToSz / FromSz;
28995   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
28996
28997   // Create a type on which we perform the shuffle.
28998   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
28999           LdVT.getScalarType(), NumElems*SizeRatio);
29000   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29001
29002   // Convert Src0 value.
29003   SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
29004   if (!Mld->getSrc0().isUndef()) {
29005     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29006     for (unsigned i = 0; i != NumElems; ++i)
29007       ShuffleVec[i] = i * SizeRatio;
29008
29009     // Can't shuffle using an illegal type.
29010     assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29011            "WideVecVT should be legal");
29012     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
29013                                     DAG.getUNDEF(WideVecVT), ShuffleVec);
29014   }
29015   // Prepare the new mask.
29016   SDValue NewMask;
29017   SDValue Mask = Mld->getMask();
29018   if (Mask.getValueType() == VT) {
29019     // Mask and original value have the same type.
29020     NewMask = DAG.getBitcast(WideVecVT, Mask);
29021     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29022     for (unsigned i = 0; i != NumElems; ++i)
29023       ShuffleVec[i] = i * SizeRatio;
29024     for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
29025       ShuffleVec[i] = NumElems * SizeRatio;
29026     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29027                                    DAG.getConstant(0, dl, WideVecVT),
29028                                    ShuffleVec);
29029   } else {
29030     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29031     unsigned WidenNumElts = NumElems*SizeRatio;
29032     unsigned MaskNumElts = VT.getVectorNumElements();
29033     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
29034                                      WidenNumElts);
29035
29036     unsigned NumConcat = WidenNumElts / MaskNumElts;
29037     SmallVector<SDValue, 16> Ops(NumConcat);
29038     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29039     Ops[0] = Mask;
29040     for (unsigned i = 1; i != NumConcat; ++i)
29041       Ops[i] = ZeroVal;
29042
29043     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29044   }
29045
29046   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
29047                                      Mld->getBasePtr(), NewMask, WideSrc0,
29048                                      Mld->getMemoryVT(), Mld->getMemOperand(),
29049                                      ISD::NON_EXTLOAD);
29050   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
29051   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
29052 }
29053
29054 /// If exactly one element of the mask is set for a non-truncating masked store,
29055 /// it is a vector extract and scalar store.
29056 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
29057 /// mask have already been optimized in IR, so we don't bother with those here.
29058 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
29059                                               SelectionDAG &DAG) {
29060   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
29061   // However, some target hooks may need to be added to know when the transform
29062   // is profitable. Endianness would also have to be considered.
29063
29064   SDValue Addr, VecIndex;
29065   unsigned Alignment;
29066   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
29067     return SDValue();
29068
29069   // Extract the one scalar element that is actually being stored.
29070   SDLoc DL(MS);
29071   EVT VT = MS->getValue().getValueType();
29072   EVT EltVT = VT.getVectorElementType();
29073   SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
29074                                 MS->getValue(), VecIndex);
29075
29076   // Store that element at the appropriate offset from the base pointer.
29077   return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
29078                       Alignment, MS->getMemOperand()->getFlags());
29079 }
29080
29081 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
29082                                   const X86Subtarget &Subtarget) {
29083   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
29084   if (!Mst->isTruncatingStore())
29085     return reduceMaskedStoreToScalarStore(Mst, DAG);
29086
29087   // Resolve truncating stores.
29088   EVT VT = Mst->getValue().getValueType();
29089   unsigned NumElems = VT.getVectorNumElements();
29090   EVT StVT = Mst->getMemoryVT();
29091   SDLoc dl(Mst);
29092
29093   assert(StVT != VT && "Cannot truncate to the same type");
29094   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29095   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29096
29097   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29098
29099   // The truncating store is legal in some cases. For example
29100   // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
29101   // are designated for truncate store.
29102   // In this case we don't need any further transformations.
29103   if (TLI.isTruncStoreLegal(VT, StVT))
29104     return SDValue();
29105
29106   // From/To sizes and ElemCount must be pow of two.
29107   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
29108     "Unexpected size for truncating masked store");
29109   // We are going to use the original vector elt for storing.
29110   // Accumulated smaller vector elements must be a multiple of the store size.
29111   assert (((NumElems * FromSz) % ToSz) == 0 &&
29112           "Unexpected ratio for truncating masked store");
29113
29114   unsigned SizeRatio  = FromSz / ToSz;
29115   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29116
29117   // Create a type on which we perform the shuffle.
29118   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29119           StVT.getScalarType(), NumElems*SizeRatio);
29120
29121   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29122
29123   SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
29124   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29125   for (unsigned i = 0; i != NumElems; ++i)
29126     ShuffleVec[i] = i * SizeRatio;
29127
29128   // Can't shuffle using an illegal type.
29129   assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29130          "WideVecVT should be legal");
29131
29132   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29133                                               DAG.getUNDEF(WideVecVT),
29134                                               ShuffleVec);
29135
29136   SDValue NewMask;
29137   SDValue Mask = Mst->getMask();
29138   if (Mask.getValueType() == VT) {
29139     // Mask and original value have the same type.
29140     NewMask = DAG.getBitcast(WideVecVT, Mask);
29141     for (unsigned i = 0; i != NumElems; ++i)
29142       ShuffleVec[i] = i * SizeRatio;
29143     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
29144       ShuffleVec[i] = NumElems*SizeRatio;
29145     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29146                                    DAG.getConstant(0, dl, WideVecVT),
29147                                    ShuffleVec);
29148   } else {
29149     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29150     unsigned WidenNumElts = NumElems*SizeRatio;
29151     unsigned MaskNumElts = VT.getVectorNumElements();
29152     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
29153                                      WidenNumElts);
29154
29155     unsigned NumConcat = WidenNumElts / MaskNumElts;
29156     SmallVector<SDValue, 16> Ops(NumConcat);
29157     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29158     Ops[0] = Mask;
29159     for (unsigned i = 1; i != NumConcat; ++i)
29160       Ops[i] = ZeroVal;
29161
29162     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29163   }
29164
29165   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
29166                             Mst->getBasePtr(), NewMask, StVT,
29167                             Mst->getMemOperand(), false);
29168 }
29169
29170 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
29171                             const X86Subtarget &Subtarget) {
29172   StoreSDNode *St = cast<StoreSDNode>(N);
29173   EVT VT = St->getValue().getValueType();
29174   EVT StVT = St->getMemoryVT();
29175   SDLoc dl(St);
29176   SDValue StoredVal = St->getOperand(1);
29177   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29178
29179   // If we are saving a concatenation of two XMM registers and 32-byte stores
29180   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
29181   bool Fast;
29182   unsigned AddressSpace = St->getAddressSpace();
29183   unsigned Alignment = St->getAlignment();
29184   if (VT.is256BitVector() && StVT == VT &&
29185       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
29186                              AddressSpace, Alignment, &Fast) &&
29187       !Fast) {
29188     unsigned NumElems = VT.getVectorNumElements();
29189     if (NumElems < 2)
29190       return SDValue();
29191
29192     SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
29193     SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
29194
29195     SDValue Ptr0 = St->getBasePtr();
29196     SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
29197
29198     SDValue Ch0 =
29199         DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
29200                      Alignment, St->getMemOperand()->getFlags());
29201     SDValue Ch1 =
29202         DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
29203                      std::min(16U, Alignment), St->getMemOperand()->getFlags());
29204     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
29205   }
29206
29207   // Optimize trunc store (of multiple scalars) to shuffle and store.
29208   // First, pack all of the elements in one place. Next, store to memory
29209   // in fewer chunks.
29210   if (St->isTruncatingStore() && VT.isVector()) {
29211     // Check if we can detect an AVG pattern from the truncation. If yes,
29212     // replace the trunc store by a normal store with the result of X86ISD::AVG
29213     // instruction.
29214     if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
29215                                        Subtarget, dl))
29216       return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
29217                           St->getPointerInfo(), St->getAlignment(),
29218                           St->getMemOperand()->getFlags());
29219
29220     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29221     unsigned NumElems = VT.getVectorNumElements();
29222     assert(StVT != VT && "Cannot truncate to the same type");
29223     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29224     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29225
29226     // The truncating store is legal in some cases. For example
29227     // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
29228     // are designated for truncate store.
29229     // In this case we don't need any further transformations.
29230     if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
29231       return SDValue();
29232
29233     // From, To sizes and ElemCount must be pow of two
29234     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
29235     // We are going to use the original vector elt for storing.
29236     // Accumulated smaller vector elements must be a multiple of the store size.
29237     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
29238
29239     unsigned SizeRatio  = FromSz / ToSz;
29240
29241     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29242
29243     // Create a type on which we perform the shuffle
29244     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29245             StVT.getScalarType(), NumElems*SizeRatio);
29246
29247     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29248
29249     SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
29250     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
29251     for (unsigned i = 0; i != NumElems; ++i)
29252       ShuffleVec[i] = i * SizeRatio;
29253
29254     // Can't shuffle using an illegal type.
29255     if (!TLI.isTypeLegal(WideVecVT))
29256       return SDValue();
29257
29258     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29259                                          DAG.getUNDEF(WideVecVT),
29260                                          ShuffleVec);
29261     // At this point all of the data is stored at the bottom of the
29262     // register. We now need to save it to mem.
29263
29264     // Find the largest store unit
29265     MVT StoreType = MVT::i8;
29266     for (MVT Tp : MVT::integer_valuetypes()) {
29267       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
29268         StoreType = Tp;
29269     }
29270
29271     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
29272     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
29273         (64 <= NumElems * ToSz))
29274       StoreType = MVT::f64;
29275
29276     // Bitcast the original vector into a vector of store-size units
29277     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
29278             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
29279     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
29280     SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
29281     SmallVector<SDValue, 8> Chains;
29282     SDValue Ptr = St->getBasePtr();
29283
29284     // Perform one or more big stores into memory.
29285     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
29286       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
29287                                    StoreType, ShuffWide,
29288                                    DAG.getIntPtrConstant(i, dl));
29289       SDValue Ch =
29290           DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
29291                        St->getAlignment(), St->getMemOperand()->getFlags());
29292       Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
29293       Chains.push_back(Ch);
29294     }
29295
29296     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
29297   }
29298
29299   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
29300   // the FP state in cases where an emms may be missing.
29301   // A preferable solution to the general problem is to figure out the right
29302   // places to insert EMMS.  This qualifies as a quick hack.
29303
29304   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
29305   if (VT.getSizeInBits() != 64)
29306     return SDValue();
29307
29308   const Function *F = DAG.getMachineFunction().getFunction();
29309   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
29310   bool F64IsLegal =
29311       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
29312   if ((VT.isVector() ||
29313        (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
29314       isa<LoadSDNode>(St->getValue()) &&
29315       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
29316       St->getChain().hasOneUse() && !St->isVolatile()) {
29317     SDNode* LdVal = St->getValue().getNode();
29318     LoadSDNode *Ld = nullptr;
29319     int TokenFactorIndex = -1;
29320     SmallVector<SDValue, 8> Ops;
29321     SDNode* ChainVal = St->getChain().getNode();
29322     // Must be a store of a load.  We currently handle two cases:  the load
29323     // is a direct child, and it's under an intervening TokenFactor.  It is
29324     // possible to dig deeper under nested TokenFactors.
29325     if (ChainVal == LdVal)
29326       Ld = cast<LoadSDNode>(St->getChain());
29327     else if (St->getValue().hasOneUse() &&
29328              ChainVal->getOpcode() == ISD::TokenFactor) {
29329       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
29330         if (ChainVal->getOperand(i).getNode() == LdVal) {
29331           TokenFactorIndex = i;
29332           Ld = cast<LoadSDNode>(St->getValue());
29333         } else
29334           Ops.push_back(ChainVal->getOperand(i));
29335       }
29336     }
29337
29338     if (!Ld || !ISD::isNormalLoad(Ld))
29339       return SDValue();
29340
29341     // If this is not the MMX case, i.e. we are just turning i64 load/store
29342     // into f64 load/store, avoid the transformation if there are multiple
29343     // uses of the loaded value.
29344     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
29345       return SDValue();
29346
29347     SDLoc LdDL(Ld);
29348     SDLoc StDL(N);
29349     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
29350     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
29351     // pair instead.
29352     if (Subtarget.is64Bit() || F64IsLegal) {
29353       MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
29354       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
29355                                   Ld->getPointerInfo(), Ld->getAlignment(),
29356                                   Ld->getMemOperand()->getFlags());
29357       SDValue NewChain = NewLd.getValue(1);
29358       if (TokenFactorIndex >= 0) {
29359         Ops.push_back(NewChain);
29360         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29361       }
29362       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
29363                           St->getPointerInfo(), St->getAlignment(),
29364                           St->getMemOperand()->getFlags());
29365     }
29366
29367     // Otherwise, lower to two pairs of 32-bit loads / stores.
29368     SDValue LoAddr = Ld->getBasePtr();
29369     SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
29370
29371     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
29372                                Ld->getPointerInfo(), Ld->getAlignment(),
29373                                Ld->getMemOperand()->getFlags());
29374     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
29375                                Ld->getPointerInfo().getWithOffset(4),
29376                                MinAlign(Ld->getAlignment(), 4),
29377                                Ld->getMemOperand()->getFlags());
29378
29379     SDValue NewChain = LoLd.getValue(1);
29380     if (TokenFactorIndex >= 0) {
29381       Ops.push_back(LoLd);
29382       Ops.push_back(HiLd);
29383       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29384     }
29385
29386     LoAddr = St->getBasePtr();
29387     HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
29388
29389     SDValue LoSt =
29390         DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
29391                      St->getAlignment(), St->getMemOperand()->getFlags());
29392     SDValue HiSt = DAG.getStore(
29393         NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
29394         MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
29395     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
29396   }
29397
29398   // This is similar to the above case, but here we handle a scalar 64-bit
29399   // integer store that is extracted from a vector on a 32-bit target.
29400   // If we have SSE2, then we can treat it like a floating-point double
29401   // to get past legalization. The execution dependencies fixup pass will
29402   // choose the optimal machine instruction for the store if this really is
29403   // an integer or v2f32 rather than an f64.
29404   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
29405       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
29406     SDValue OldExtract = St->getOperand(1);
29407     SDValue ExtOp0 = OldExtract.getOperand(0);
29408     unsigned VecSize = ExtOp0.getValueSizeInBits();
29409     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
29410     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
29411     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
29412                                      BitCast, OldExtract.getOperand(1));
29413     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
29414                         St->getPointerInfo(), St->getAlignment(),
29415                         St->getMemOperand()->getFlags());
29416   }
29417
29418   return SDValue();
29419 }
29420
29421 /// Return 'true' if this vector operation is "horizontal"
29422 /// and return the operands for the horizontal operation in LHS and RHS.  A
29423 /// horizontal operation performs the binary operation on successive elements
29424 /// of its first operand, then on successive elements of its second operand,
29425 /// returning the resulting values in a vector.  For example, if
29426 ///   A = < float a0, float a1, float a2, float a3 >
29427 /// and
29428 ///   B = < float b0, float b1, float b2, float b3 >
29429 /// then the result of doing a horizontal operation on A and B is
29430 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
29431 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
29432 /// A horizontal-op B, for some already available A and B, and if so then LHS is
29433 /// set to A, RHS to B, and the routine returns 'true'.
29434 /// Note that the binary operation should have the property that if one of the
29435 /// operands is UNDEF then the result is UNDEF.
29436 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
29437   // Look for the following pattern: if
29438   //   A = < float a0, float a1, float a2, float a3 >
29439   //   B = < float b0, float b1, float b2, float b3 >
29440   // and
29441   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
29442   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
29443   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
29444   // which is A horizontal-op B.
29445
29446   // At least one of the operands should be a vector shuffle.
29447   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
29448       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
29449     return false;
29450
29451   MVT VT = LHS.getSimpleValueType();
29452
29453   assert((VT.is128BitVector() || VT.is256BitVector()) &&
29454          "Unsupported vector type for horizontal add/sub");
29455
29456   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
29457   // operate independently on 128-bit lanes.
29458   unsigned NumElts = VT.getVectorNumElements();
29459   unsigned NumLanes = VT.getSizeInBits()/128;
29460   unsigned NumLaneElts = NumElts / NumLanes;
29461   assert((NumLaneElts % 2 == 0) &&
29462          "Vector type should have an even number of elements in each lane");
29463   unsigned HalfLaneElts = NumLaneElts/2;
29464
29465   // View LHS in the form
29466   //   LHS = VECTOR_SHUFFLE A, B, LMask
29467   // If LHS is not a shuffle then pretend it is the shuffle
29468   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
29469   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
29470   // type VT.
29471   SDValue A, B;
29472   SmallVector<int, 16> LMask(NumElts);
29473   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29474     if (!LHS.getOperand(0).isUndef())
29475       A = LHS.getOperand(0);
29476     if (!LHS.getOperand(1).isUndef())
29477       B = LHS.getOperand(1);
29478     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
29479     std::copy(Mask.begin(), Mask.end(), LMask.begin());
29480   } else {
29481     if (!LHS.isUndef())
29482       A = LHS;
29483     for (unsigned i = 0; i != NumElts; ++i)
29484       LMask[i] = i;
29485   }
29486
29487   // Likewise, view RHS in the form
29488   //   RHS = VECTOR_SHUFFLE C, D, RMask
29489   SDValue C, D;
29490   SmallVector<int, 16> RMask(NumElts);
29491   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29492     if (!RHS.getOperand(0).isUndef())
29493       C = RHS.getOperand(0);
29494     if (!RHS.getOperand(1).isUndef())
29495       D = RHS.getOperand(1);
29496     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
29497     std::copy(Mask.begin(), Mask.end(), RMask.begin());
29498   } else {
29499     if (!RHS.isUndef())
29500       C = RHS;
29501     for (unsigned i = 0; i != NumElts; ++i)
29502       RMask[i] = i;
29503   }
29504
29505   // Check that the shuffles are both shuffling the same vectors.
29506   if (!(A == C && B == D) && !(A == D && B == C))
29507     return false;
29508
29509   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
29510   if (!A.getNode() && !B.getNode())
29511     return false;
29512
29513   // If A and B occur in reverse order in RHS, then "swap" them (which means
29514   // rewriting the mask).
29515   if (A != C)
29516     ShuffleVectorSDNode::commuteMask(RMask);
29517
29518   // At this point LHS and RHS are equivalent to
29519   //   LHS = VECTOR_SHUFFLE A, B, LMask
29520   //   RHS = VECTOR_SHUFFLE A, B, RMask
29521   // Check that the masks correspond to performing a horizontal operation.
29522   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
29523     for (unsigned i = 0; i != NumLaneElts; ++i) {
29524       int LIdx = LMask[i+l], RIdx = RMask[i+l];
29525
29526       // Ignore any UNDEF components.
29527       if (LIdx < 0 || RIdx < 0 ||
29528           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
29529           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
29530         continue;
29531
29532       // Check that successive elements are being operated on.  If not, this is
29533       // not a horizontal operation.
29534       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
29535       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
29536       if (!(LIdx == Index && RIdx == Index + 1) &&
29537           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
29538         return false;
29539     }
29540   }
29541
29542   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
29543   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
29544   return true;
29545 }
29546
29547 /// Do target-specific dag combines on floating-point adds/subs.
29548 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
29549                                const X86Subtarget &Subtarget) {
29550   EVT VT = N->getValueType(0);
29551   SDValue LHS = N->getOperand(0);
29552   SDValue RHS = N->getOperand(1);
29553   bool IsFadd = N->getOpcode() == ISD::FADD;
29554   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
29555
29556   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
29557   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
29558        (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
29559       isHorizontalBinOp(LHS, RHS, IsFadd)) {
29560     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
29561     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
29562   }
29563   return SDValue();
29564 }
29565
29566 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
29567 static SDValue
29568 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
29569                                   SmallVector<SDValue, 8> &Regs) {
29570   assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
29571                              Regs[0].getValueType() == MVT::v2i64));
29572   EVT OutVT = N->getValueType(0);
29573   EVT OutSVT = OutVT.getVectorElementType();
29574   EVT InVT = Regs[0].getValueType();
29575   EVT InSVT = InVT.getVectorElementType();
29576   SDLoc DL(N);
29577
29578   // First, use mask to unset all bits that won't appear in the result.
29579   assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
29580          "OutSVT can only be either i8 or i16.");
29581   APInt Mask =
29582       APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
29583   SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
29584   for (auto &Reg : Regs)
29585     Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
29586
29587   MVT UnpackedVT, PackedVT;
29588   if (OutSVT == MVT::i8) {
29589     UnpackedVT = MVT::v8i16;
29590     PackedVT = MVT::v16i8;
29591   } else {
29592     UnpackedVT = MVT::v4i32;
29593     PackedVT = MVT::v8i16;
29594   }
29595
29596   // In each iteration, truncate the type by a half size.
29597   auto RegNum = Regs.size();
29598   for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
29599        j < e; j *= 2, RegNum /= 2) {
29600     for (unsigned i = 0; i < RegNum; i++)
29601       Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
29602     for (unsigned i = 0; i < RegNum / 2; i++)
29603       Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
29604                             Regs[i * 2 + 1]);
29605   }
29606
29607   // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
29608   // then extract a subvector as the result since v8i8 is not a legal type.
29609   if (OutVT == MVT::v8i8) {
29610     Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
29611     Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
29612                           DAG.getIntPtrConstant(0, DL));
29613     return Regs[0];
29614   } else if (RegNum > 1) {
29615     Regs.resize(RegNum);
29616     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29617   } else
29618     return Regs[0];
29619 }
29620
29621 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
29622 static SDValue
29623 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
29624                                   SmallVector<SDValue, 8> &Regs) {
29625   assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
29626   EVT OutVT = N->getValueType(0);
29627   SDLoc DL(N);
29628
29629   // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
29630   SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
29631   for (auto &Reg : Regs) {
29632     Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29633     Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29634   }
29635
29636   for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
29637     Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
29638                           Regs[i * 2 + 1]);
29639
29640   if (Regs.size() > 2) {
29641     Regs.resize(Regs.size() / 2);
29642     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29643   } else
29644     return Regs[0];
29645 }
29646
29647 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
29648 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
29649 /// legalization the truncation will be translated into a BUILD_VECTOR with each
29650 /// element that is extracted from a vector and then truncated, and it is
29651 /// diffcult to do this optimization based on them.
29652 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
29653                                        const X86Subtarget &Subtarget) {
29654   EVT OutVT = N->getValueType(0);
29655   if (!OutVT.isVector())
29656     return SDValue();
29657
29658   SDValue In = N->getOperand(0);
29659   if (!In.getValueType().isSimple())
29660     return SDValue();
29661
29662   EVT InVT = In.getValueType();
29663   unsigned NumElems = OutVT.getVectorNumElements();
29664
29665   // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
29666   // SSE2, and we need to take care of it specially.
29667   // AVX512 provides vpmovdb.
29668   if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
29669     return SDValue();
29670
29671   EVT OutSVT = OutVT.getVectorElementType();
29672   EVT InSVT = InVT.getVectorElementType();
29673   if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
29674         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
29675         NumElems >= 8))
29676     return SDValue();
29677
29678   // SSSE3's pshufb results in less instructions in the cases below.
29679   if (Subtarget.hasSSSE3() && NumElems == 8 &&
29680       ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
29681        (InSVT == MVT::i32 && OutSVT == MVT::i16)))
29682     return SDValue();
29683
29684   SDLoc DL(N);
29685
29686   // Split a long vector into vectors of legal type.
29687   unsigned RegNum = InVT.getSizeInBits() / 128;
29688   SmallVector<SDValue, 8> SubVec(RegNum);
29689   unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
29690   EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
29691
29692   for (unsigned i = 0; i < RegNum; i++)
29693     SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
29694                             DAG.getIntPtrConstant(i * NumSubRegElts, DL));
29695
29696   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
29697   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
29698   // truncate 2 x v4i32 to v8i16.
29699   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
29700     return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
29701   else if (InSVT == MVT::i32)
29702     return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
29703   else
29704     return SDValue();
29705 }
29706
29707 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
29708                                const X86Subtarget &Subtarget) {
29709   EVT VT = N->getValueType(0);
29710   SDValue Src = N->getOperand(0);
29711   SDLoc DL(N);
29712
29713   // Try to detect AVG pattern first.
29714   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
29715     return Avg;
29716
29717   // The bitcast source is a direct mmx result.
29718   // Detect bitcasts between i32 to x86mmx
29719   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
29720     SDValue BCSrc = Src.getOperand(0);
29721     if (BCSrc.getValueType() == MVT::x86mmx)
29722       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
29723   }
29724
29725   return combineVectorTruncation(N, DAG, Subtarget);
29726 }
29727
29728 /// Do target-specific dag combines on floating point negations.
29729 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
29730                            const X86Subtarget &Subtarget) {
29731   EVT VT = N->getValueType(0);
29732   EVT SVT = VT.getScalarType();
29733   SDValue Arg = N->getOperand(0);
29734   SDLoc DL(N);
29735
29736   // Let legalize expand this if it isn't a legal type yet.
29737   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
29738     return SDValue();
29739
29740   // If we're negating a FMUL node on a target with FMA, then we can avoid the
29741   // use of a constant by performing (-0 - A*B) instead.
29742   // FIXME: Check rounding control flags as well once it becomes available.
29743   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
29744       Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
29745     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
29746     return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29747                        Arg.getOperand(1), Zero);
29748   }
29749
29750   // If we're negating a FMA node, then we can adjust the
29751   // instruction to include the extra negation.
29752   if (Arg.hasOneUse()) {
29753     switch (Arg.getOpcode()) {
29754     case X86ISD::FMADD:
29755       return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29756                          Arg.getOperand(1), Arg.getOperand(2));
29757     case X86ISD::FMSUB:
29758       return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
29759                          Arg.getOperand(1), Arg.getOperand(2));
29760     case X86ISD::FNMADD:
29761       return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
29762                          Arg.getOperand(1), Arg.getOperand(2));
29763     case X86ISD::FNMSUB:
29764       return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
29765                          Arg.getOperand(1), Arg.getOperand(2));
29766     }
29767   }
29768   return SDValue();
29769 }
29770
29771 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
29772                               const X86Subtarget &Subtarget) {
29773   EVT VT = N->getValueType(0);
29774   if (VT.is512BitVector() && !Subtarget.hasDQI()) {
29775     // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention.
29776     // These logic operations may be executed in the integer domain.
29777     SDLoc dl(N);
29778     MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
29779     MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
29780
29781     SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
29782     SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
29783     unsigned IntOpcode = 0;
29784     switch (N->getOpcode()) {
29785       default: llvm_unreachable("Unexpected FP logic op");
29786       case X86ISD::FOR: IntOpcode = ISD::OR; break;
29787       case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
29788       case X86ISD::FAND: IntOpcode = ISD::AND; break;
29789       case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
29790     }
29791     SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
29792     return DAG.getBitcast(VT, IntOp);
29793   }
29794   return SDValue();
29795 }
29796 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
29797 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
29798                           const X86Subtarget &Subtarget) {
29799   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
29800
29801   // F[X]OR(0.0, x) -> x
29802   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29803     if (C->getValueAPF().isPosZero())
29804       return N->getOperand(1);
29805
29806   // F[X]OR(x, 0.0) -> x
29807   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29808     if (C->getValueAPF().isPosZero())
29809       return N->getOperand(0);
29810
29811   return lowerX86FPLogicOp(N, DAG, Subtarget);
29812 }
29813
29814 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
29815 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
29816   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
29817
29818   // Only perform optimizations if UnsafeMath is used.
29819   if (!DAG.getTarget().Options.UnsafeFPMath)
29820     return SDValue();
29821
29822   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
29823   // into FMINC and FMAXC, which are Commutative operations.
29824   unsigned NewOp = 0;
29825   switch (N->getOpcode()) {
29826     default: llvm_unreachable("unknown opcode");
29827     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
29828     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
29829   }
29830
29831   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
29832                      N->getOperand(0), N->getOperand(1));
29833 }
29834
29835 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
29836                                      const X86Subtarget &Subtarget) {
29837   if (Subtarget.useSoftFloat())
29838     return SDValue();
29839
29840   // TODO: Check for global or instruction-level "nnan". In that case, we
29841   //       should be able to lower to FMAX/FMIN alone.
29842   // TODO: If an operand is already known to be a NaN or not a NaN, this
29843   //       should be an optional swap and FMAX/FMIN.
29844
29845   EVT VT = N->getValueType(0);
29846   if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
29847         (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
29848         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
29849     return SDValue();
29850
29851   // This takes at least 3 instructions, so favor a library call when operating
29852   // on a scalar and minimizing code size.
29853   if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
29854     return SDValue();
29855
29856   SDValue Op0 = N->getOperand(0);
29857   SDValue Op1 = N->getOperand(1);
29858   SDLoc DL(N);
29859   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
29860       DAG.getDataLayout(), *DAG.getContext(), VT);
29861
29862   // There are 4 possibilities involving NaN inputs, and these are the required
29863   // outputs:
29864   //                   Op1
29865   //               Num     NaN
29866   //            ----------------
29867   //       Num  |  Max  |  Op0 |
29868   // Op0        ----------------
29869   //       NaN  |  Op1  |  NaN |
29870   //            ----------------
29871   //
29872   // The SSE FP max/min instructions were not designed for this case, but rather
29873   // to implement:
29874   //   Min = Op1 < Op0 ? Op1 : Op0
29875   //   Max = Op1 > Op0 ? Op1 : Op0
29876   //
29877   // So they always return Op0 if either input is a NaN. However, we can still
29878   // use those instructions for fmaxnum by selecting away a NaN input.
29879
29880   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
29881   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
29882   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
29883   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
29884
29885   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
29886   // are NaN, the NaN value of Op1 is the result.
29887   auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
29888   return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
29889 }
29890
29891 /// Do target-specific dag combines on X86ISD::FAND nodes.
29892 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
29893                            const X86Subtarget &Subtarget) {
29894   // FAND(0.0, x) -> 0.0
29895   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29896     if (C->getValueAPF().isPosZero())
29897       return N->getOperand(0);
29898
29899   // FAND(x, 0.0) -> 0.0
29900   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29901     if (C->getValueAPF().isPosZero())
29902       return N->getOperand(1);
29903
29904   return lowerX86FPLogicOp(N, DAG, Subtarget);
29905 }
29906
29907 /// Do target-specific dag combines on X86ISD::FANDN nodes
29908 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
29909                             const X86Subtarget &Subtarget) {
29910   // FANDN(0.0, x) -> x
29911   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29912     if (C->getValueAPF().isPosZero())
29913       return N->getOperand(1);
29914
29915   // FANDN(x, 0.0) -> 0.0
29916   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29917     if (C->getValueAPF().isPosZero())
29918       return N->getOperand(1);
29919
29920   return lowerX86FPLogicOp(N, DAG, Subtarget);
29921 }
29922
29923 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
29924                          TargetLowering::DAGCombinerInfo &DCI) {
29925   // BT ignores high bits in the bit index operand.
29926   SDValue Op1 = N->getOperand(1);
29927   if (Op1.hasOneUse()) {
29928     unsigned BitWidth = Op1.getValueSizeInBits();
29929     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
29930     APInt KnownZero, KnownOne;
29931     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
29932                                           !DCI.isBeforeLegalizeOps());
29933     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29934     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
29935         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
29936       DCI.CommitTargetLoweringOpt(TLO);
29937   }
29938   return SDValue();
29939 }
29940
29941 static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
29942   SDValue Op = peekThroughBitcasts(N->getOperand(0));
29943   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
29944   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
29945       VT.getVectorElementType().getSizeInBits() ==
29946       OpVT.getVectorElementType().getSizeInBits()) {
29947     return DAG.getBitcast(VT, Op);
29948   }
29949   return SDValue();
29950 }
29951
29952 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
29953                                       const X86Subtarget &Subtarget) {
29954   EVT VT = N->getValueType(0);
29955   if (!VT.isVector())
29956     return SDValue();
29957
29958   SDValue N0 = N->getOperand(0);
29959   SDValue N1 = N->getOperand(1);
29960   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
29961   SDLoc dl(N);
29962
29963   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
29964   // both SSE and AVX2 since there is no sign-extended shift right
29965   // operation on a vector with 64-bit elements.
29966   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
29967   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
29968   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
29969       N0.getOpcode() == ISD::SIGN_EXTEND)) {
29970     SDValue N00 = N0.getOperand(0);
29971
29972     // EXTLOAD has a better solution on AVX2,
29973     // it may be replaced with X86ISD::VSEXT node.
29974     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
29975       if (!ISD::isNormalLoad(N00.getNode()))
29976         return SDValue();
29977
29978     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
29979         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
29980                                   N00, N1);
29981       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
29982     }
29983   }
29984   return SDValue();
29985 }
29986
29987 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
29988 /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
29989 /// to combine math ops, use an LEA, or use a complex addressing mode. This can
29990 /// eliminate extend, add, and shift instructions.
29991 static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
29992                                        const X86Subtarget &Subtarget) {
29993   // TODO: This should be valid for other integer types.
29994   EVT VT = Sext->getValueType(0);
29995   if (VT != MVT::i64)
29996     return SDValue();
29997
29998   // We need an 'add nsw' feeding into the 'sext'.
29999   SDValue Add = Sext->getOperand(0);
30000   if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
30001     return SDValue();
30002
30003   // Having a constant operand to the 'add' ensures that we are not increasing
30004   // the instruction count because the constant is extended for free below.
30005   // A constant operand can also become the displacement field of an LEA.
30006   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
30007   if (!AddOp1)
30008     return SDValue();
30009
30010   // Don't make the 'add' bigger if there's no hope of combining it with some
30011   // other 'add' or 'shl' instruction.
30012   // TODO: It may be profitable to generate simpler LEA instructions in place
30013   // of single 'add' instructions, but the cost model for selecting an LEA
30014   // currently has a high threshold.
30015   bool HasLEAPotential = false;
30016   for (auto *User : Sext->uses()) {
30017     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
30018       HasLEAPotential = true;
30019       break;
30020     }
30021   }
30022   if (!HasLEAPotential)
30023     return SDValue();
30024
30025   // Everything looks good, so pull the 'sext' ahead of the 'add'.
30026   int64_t AddConstant = AddOp1->getSExtValue();
30027   SDValue AddOp0 = Add.getOperand(0);
30028   SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
30029   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
30030
30031   // The wider add is guaranteed to not wrap because both operands are
30032   // sign-extended.
30033   SDNodeFlags Flags;
30034   Flags.setNoSignedWrap(true);
30035   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
30036 }
30037
30038 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
30039 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
30040 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
30041 /// extends from AH (which we otherwise need to do contortions to access).
30042 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
30043   SDValue N0 = N->getOperand(0);
30044   auto OpcodeN = N->getOpcode();
30045   auto OpcodeN0 = N0.getOpcode();
30046   if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
30047         (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
30048     return SDValue();
30049
30050   EVT VT = N->getValueType(0);
30051   EVT InVT = N0.getValueType();
30052   if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
30053     return SDValue();
30054
30055   SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
30056   auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
30057                                                : X86ISD::UDIVREM8_ZEXT_HREG;
30058   SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
30059                           N0.getOperand(1));
30060   DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
30061   return R.getValue(1);
30062 }
30063
30064 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
30065 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
30066 /// with UNDEFs) of the input to vectors of the same size as the target type
30067 /// which then extends the lowest elements.
30068 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
30069                                           TargetLowering::DAGCombinerInfo &DCI,
30070                                           const X86Subtarget &Subtarget) {
30071   unsigned Opcode = N->getOpcode();
30072   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
30073     return SDValue();
30074   if (!DCI.isBeforeLegalizeOps())
30075     return SDValue();
30076   if (!Subtarget.hasSSE2())
30077     return SDValue();
30078
30079   SDValue N0 = N->getOperand(0);
30080   EVT VT = N->getValueType(0);
30081   EVT SVT = VT.getScalarType();
30082   EVT InVT = N0.getValueType();
30083   EVT InSVT = InVT.getScalarType();
30084
30085   // Input type must be a vector and we must be extending legal integer types.
30086   if (!VT.isVector())
30087     return SDValue();
30088   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
30089     return SDValue();
30090   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
30091     return SDValue();
30092
30093   // On AVX2+ targets, if the input/output types are both legal then we will be
30094   // able to use SIGN_EXTEND/ZERO_EXTEND directly.
30095   if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
30096       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
30097     return SDValue();
30098
30099   SDLoc DL(N);
30100
30101   auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
30102     EVT InVT = N.getValueType();
30103     EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
30104                                  Size / InVT.getScalarSizeInBits());
30105     SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
30106                                   DAG.getUNDEF(InVT));
30107     Opnds[0] = N;
30108     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
30109   };
30110
30111   // If target-size is less than 128-bits, extend to a type that would extend
30112   // to 128 bits, extend that and extract the original target vector.
30113   if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
30114     unsigned Scale = 128 / VT.getSizeInBits();
30115     EVT ExVT =
30116         EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
30117     SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
30118     SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
30119     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
30120                        DAG.getIntPtrConstant(0, DL));
30121   }
30122
30123   // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
30124   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
30125   // Also use this if we don't have SSE41 to allow the legalizer do its job.
30126   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
30127       (VT.is256BitVector() && Subtarget.hasInt256())) {
30128     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
30129     return Opcode == ISD::SIGN_EXTEND
30130                ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
30131                : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
30132   }
30133
30134   // On pre-AVX2 targets, split into 128-bit nodes of
30135   // ISD::*_EXTEND_VECTOR_INREG.
30136   if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
30137     unsigned NumVecs = VT.getSizeInBits() / 128;
30138     unsigned NumSubElts = 128 / SVT.getSizeInBits();
30139     EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
30140     EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
30141
30142     SmallVector<SDValue, 8> Opnds;
30143     for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
30144       SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
30145                                    DAG.getIntPtrConstant(Offset, DL));
30146       SrcVec = ExtendVecSize(DL, SrcVec, 128);
30147       SrcVec = Opcode == ISD::SIGN_EXTEND
30148                    ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
30149                    : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
30150       Opnds.push_back(SrcVec);
30151     }
30152     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
30153   }
30154
30155   return SDValue();
30156 }
30157
30158 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
30159                            TargetLowering::DAGCombinerInfo &DCI,
30160                            const X86Subtarget &Subtarget) {
30161   SDValue N0 = N->getOperand(0);
30162   EVT VT = N->getValueType(0);
30163   EVT InVT = N0.getValueType();
30164   SDLoc DL(N);
30165
30166   if (SDValue DivRem8 = getDivRem8(N, DAG))
30167     return DivRem8;
30168
30169   if (!DCI.isBeforeLegalizeOps()) {
30170     if (InVT == MVT::i1) {
30171       SDValue Zero = DAG.getConstant(0, DL, VT);
30172       SDValue AllOnes =
30173           DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
30174       return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
30175     }
30176     return SDValue();
30177   }
30178
30179   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30180     return V;
30181
30182   if (Subtarget.hasAVX() && VT.is256BitVector())
30183     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30184       return R;
30185
30186   if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
30187     return NewAdd;
30188
30189   return SDValue();
30190 }
30191
30192 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
30193                           const X86Subtarget &Subtarget) {
30194   SDLoc dl(N);
30195   EVT VT = N->getValueType(0);
30196
30197   // Let legalize expand this if it isn't a legal type yet.
30198   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30199     return SDValue();
30200
30201   EVT ScalarVT = VT.getScalarType();
30202   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
30203     return SDValue();
30204
30205   SDValue A = N->getOperand(0);
30206   SDValue B = N->getOperand(1);
30207   SDValue C = N->getOperand(2);
30208
30209   bool NegA = (A.getOpcode() == ISD::FNEG);
30210   bool NegB = (B.getOpcode() == ISD::FNEG);
30211   bool NegC = (C.getOpcode() == ISD::FNEG);
30212
30213   // Negative multiplication when NegA xor NegB
30214   bool NegMul = (NegA != NegB);
30215   if (NegA)
30216     A = A.getOperand(0);
30217   if (NegB)
30218     B = B.getOperand(0);
30219   if (NegC)
30220     C = C.getOperand(0);
30221
30222   unsigned Opcode;
30223   if (!NegMul)
30224     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
30225   else
30226     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
30227
30228   return DAG.getNode(Opcode, dl, VT, A, B, C);
30229 }
30230
30231 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
30232                            TargetLowering::DAGCombinerInfo &DCI,
30233                            const X86Subtarget &Subtarget) {
30234   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
30235   //           (and (i32 x86isd::setcc_carry), 1)
30236   // This eliminates the zext. This transformation is necessary because
30237   // ISD::SETCC is always legalized to i8.
30238   SDLoc dl(N);
30239   SDValue N0 = N->getOperand(0);
30240   EVT VT = N->getValueType(0);
30241
30242   if (N0.getOpcode() == ISD::AND &&
30243       N0.hasOneUse() &&
30244       N0.getOperand(0).hasOneUse()) {
30245     SDValue N00 = N0.getOperand(0);
30246     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30247       if (!isOneConstant(N0.getOperand(1)))
30248         return SDValue();
30249       return DAG.getNode(ISD::AND, dl, VT,
30250                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30251                                      N00.getOperand(0), N00.getOperand(1)),
30252                          DAG.getConstant(1, dl, VT));
30253     }
30254   }
30255
30256   if (N0.getOpcode() == ISD::TRUNCATE &&
30257       N0.hasOneUse() &&
30258       N0.getOperand(0).hasOneUse()) {
30259     SDValue N00 = N0.getOperand(0);
30260     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30261       return DAG.getNode(ISD::AND, dl, VT,
30262                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30263                                      N00.getOperand(0), N00.getOperand(1)),
30264                          DAG.getConstant(1, dl, VT));
30265     }
30266   }
30267
30268   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30269     return V;
30270
30271   if (VT.is256BitVector())
30272     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30273       return R;
30274
30275   if (SDValue DivRem8 = getDivRem8(N, DAG))
30276     return DivRem8;
30277
30278   return SDValue();
30279 }
30280
30281 /// Optimize x == -y --> x+y == 0
30282 ///          x != -y --> x+y != 0
30283 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
30284                             const X86Subtarget &Subtarget) {
30285   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
30286   SDValue LHS = N->getOperand(0);
30287   SDValue RHS = N->getOperand(1);
30288   EVT VT = N->getValueType(0);
30289   SDLoc DL(N);
30290
30291   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
30292     if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
30293       SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
30294                                  LHS.getOperand(1));
30295       return DAG.getSetCC(DL, N->getValueType(0), addV,
30296                           DAG.getConstant(0, DL, addV.getValueType()), CC);
30297     }
30298   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
30299     if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
30300       SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
30301                                  RHS.getOperand(1));
30302       return DAG.getSetCC(DL, N->getValueType(0), addV,
30303                           DAG.getConstant(0, DL, addV.getValueType()), CC);
30304     }
30305
30306   if (VT.getScalarType() == MVT::i1 &&
30307       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
30308     bool IsSEXT0 =
30309         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30310         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30311     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30312
30313     if (!IsSEXT0 || !IsVZero1) {
30314       // Swap the operands and update the condition code.
30315       std::swap(LHS, RHS);
30316       CC = ISD::getSetCCSwappedOperands(CC);
30317
30318       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30319                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30320       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30321     }
30322
30323     if (IsSEXT0 && IsVZero1) {
30324       assert(VT == LHS.getOperand(0).getValueType() &&
30325              "Uexpected operand type");
30326       if (CC == ISD::SETGT)
30327         return DAG.getConstant(0, DL, VT);
30328       if (CC == ISD::SETLE)
30329         return DAG.getConstant(1, DL, VT);
30330       if (CC == ISD::SETEQ || CC == ISD::SETGE)
30331         return DAG.getNOT(DL, LHS.getOperand(0), VT);
30332
30333       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
30334              "Unexpected condition code!");
30335       return LHS.getOperand(0);
30336     }
30337   }
30338
30339   // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
30340   // via legalization because v4i32 is not a legal type.
30341   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
30342     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
30343
30344   return SDValue();
30345 }
30346
30347 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
30348   SDLoc DL(N);
30349   // Gather and Scatter instructions use k-registers for masks. The type of
30350   // the masks is v*i1. So the mask will be truncated anyway.
30351   // The SIGN_EXTEND_INREG my be dropped.
30352   SDValue Mask = N->getOperand(2);
30353   if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
30354     SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
30355     NewOps[2] = Mask.getOperand(0);
30356     DAG.UpdateNodeOperands(N, NewOps);
30357   }
30358   return SDValue();
30359 }
30360
30361 // Helper function of performSETCCCombine. It is to materialize "setb reg"
30362 // as "sbb reg,reg", since it can be extended without zext and produces
30363 // an all-ones bit which is more useful than 0/1 in some cases.
30364 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
30365                                SelectionDAG &DAG, MVT VT) {
30366   if (VT == MVT::i8)
30367     return DAG.getNode(ISD::AND, DL, VT,
30368                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30369                                    DAG.getConstant(X86::COND_B, DL, MVT::i8),
30370                                    EFLAGS),
30371                        DAG.getConstant(1, DL, VT));
30372   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
30373   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
30374                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30375                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
30376                                  EFLAGS));
30377 }
30378
30379 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
30380 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
30381                                TargetLowering::DAGCombinerInfo &DCI,
30382                                const X86Subtarget &Subtarget) {
30383   SDLoc DL(N);
30384   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
30385   SDValue EFLAGS = N->getOperand(1);
30386
30387   if (CC == X86::COND_A) {
30388     // Try to convert COND_A into COND_B in an attempt to facilitate
30389     // materializing "setb reg".
30390     //
30391     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
30392     // cannot take an immediate as its first operand.
30393     //
30394     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
30395         EFLAGS.getValueType().isInteger() &&
30396         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
30397       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
30398                                    EFLAGS.getNode()->getVTList(),
30399                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
30400       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
30401       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
30402     }
30403   }
30404
30405   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
30406   // a zext and produces an all-ones bit which is more useful than 0/1 in some
30407   // cases.
30408   if (CC == X86::COND_B)
30409     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
30410
30411   // Try to simplify the EFLAGS and condition code operands.
30412   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30413     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30414     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
30415   }
30416
30417   return SDValue();
30418 }
30419
30420 /// Optimize branch condition evaluation.
30421 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
30422                              TargetLowering::DAGCombinerInfo &DCI,
30423                              const X86Subtarget &Subtarget) {
30424   SDLoc DL(N);
30425   SDValue EFLAGS = N->getOperand(3);
30426   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
30427
30428   // Try to simplify the EFLAGS and condition code operands.
30429   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
30430   // RAUW them under us.
30431   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30432     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30433     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
30434                        N->getOperand(1), Cond, Flags);
30435   }
30436
30437   return SDValue();
30438 }
30439
30440 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
30441                                                   SelectionDAG &DAG) {
30442   // Take advantage of vector comparisons producing 0 or -1 in each lane to
30443   // optimize away operation when it's from a constant.
30444   //
30445   // The general transformation is:
30446   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
30447   //       AND(VECTOR_CMP(x,y), constant2)
30448   //    constant2 = UNARYOP(constant)
30449
30450   // Early exit if this isn't a vector operation, the operand of the
30451   // unary operation isn't a bitwise AND, or if the sizes of the operations
30452   // aren't the same.
30453   EVT VT = N->getValueType(0);
30454   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
30455       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
30456       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
30457     return SDValue();
30458
30459   // Now check that the other operand of the AND is a constant. We could
30460   // make the transformation for non-constant splats as well, but it's unclear
30461   // that would be a benefit as it would not eliminate any operations, just
30462   // perform one more step in scalar code before moving to the vector unit.
30463   if (BuildVectorSDNode *BV =
30464           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
30465     // Bail out if the vector isn't a constant.
30466     if (!BV->isConstant())
30467       return SDValue();
30468
30469     // Everything checks out. Build up the new and improved node.
30470     SDLoc DL(N);
30471     EVT IntVT = BV->getValueType(0);
30472     // Create a new constant of the appropriate type for the transformed
30473     // DAG.
30474     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
30475     // The AND node needs bitcasts to/from an integer vector type around it.
30476     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
30477     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
30478                                  N->getOperand(0)->getOperand(0), MaskConst);
30479     SDValue Res = DAG.getBitcast(VT, NewAnd);
30480     return Res;
30481   }
30482
30483   return SDValue();
30484 }
30485
30486 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
30487                                const X86Subtarget &Subtarget) {
30488   SDValue Op0 = N->getOperand(0);
30489   EVT VT = N->getValueType(0);
30490   EVT InVT = Op0.getValueType();
30491   EVT InSVT = InVT.getScalarType();
30492   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30493
30494   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
30495   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
30496   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30497     SDLoc dl(N);
30498     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30499                                  InVT.getVectorNumElements());
30500     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
30501
30502     if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
30503       return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
30504
30505     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30506   }
30507
30508   return SDValue();
30509 }
30510
30511 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
30512                                const X86Subtarget &Subtarget) {
30513   // First try to optimize away the conversion entirely when it's
30514   // conditionally from a constant. Vectors only.
30515   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
30516     return Res;
30517
30518   // Now move on to more general possibilities.
30519   SDValue Op0 = N->getOperand(0);
30520   EVT VT = N->getValueType(0);
30521   EVT InVT = Op0.getValueType();
30522   EVT InSVT = InVT.getScalarType();
30523
30524   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
30525   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
30526   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30527     SDLoc dl(N);
30528     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30529                                  InVT.getVectorNumElements());
30530     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
30531     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30532   }
30533
30534   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
30535   // a 32-bit target where SSE doesn't support i64->FP operations.
30536   if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
30537     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
30538     EVT LdVT = Ld->getValueType(0);
30539
30540     // This transformation is not supported if the result type is f16 or f128.
30541     if (VT == MVT::f16 || VT == MVT::f128)
30542       return SDValue();
30543
30544     if (!Ld->isVolatile() && !VT.isVector() &&
30545         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
30546         !Subtarget.is64Bit() && LdVT == MVT::i64) {
30547       SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
30548           SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
30549       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
30550       return FILDChain;
30551     }
30552   }
30553   return SDValue();
30554 }
30555
30556 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
30557 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
30558                           X86TargetLowering::DAGCombinerInfo &DCI) {
30559   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
30560   // the result is either zero or one (depending on the input carry bit).
30561   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
30562   if (X86::isZeroNode(N->getOperand(0)) &&
30563       X86::isZeroNode(N->getOperand(1)) &&
30564       // We don't have a good way to replace an EFLAGS use, so only do this when
30565       // dead right now.
30566       SDValue(N, 1).use_empty()) {
30567     SDLoc DL(N);
30568     EVT VT = N->getValueType(0);
30569     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
30570     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
30571                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
30572                                            DAG.getConstant(X86::COND_B, DL,
30573                                                            MVT::i8),
30574                                            N->getOperand(2)),
30575                                DAG.getConstant(1, DL, VT));
30576     return DCI.CombineTo(N, Res1, CarryOut);
30577   }
30578
30579   return SDValue();
30580 }
30581
30582 /// fold (add Y, (sete  X, 0)) -> adc  0, Y
30583 ///      (add Y, (setne X, 0)) -> sbb -1, Y
30584 ///      (sub (sete  X, 0), Y) -> sbb  0, Y
30585 ///      (sub (setne X, 0), Y) -> adc -1, Y
30586 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
30587   SDLoc DL(N);
30588
30589   // Look through ZExts.
30590   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
30591   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
30592     return SDValue();
30593
30594   SDValue SetCC = Ext.getOperand(0);
30595   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
30596     return SDValue();
30597
30598   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
30599   if (CC != X86::COND_E && CC != X86::COND_NE)
30600     return SDValue();
30601
30602   SDValue Cmp = SetCC.getOperand(1);
30603   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
30604       !X86::isZeroNode(Cmp.getOperand(1)) ||
30605       !Cmp.getOperand(0).getValueType().isInteger())
30606     return SDValue();
30607
30608   SDValue CmpOp0 = Cmp.getOperand(0);
30609   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
30610                                DAG.getConstant(1, DL, CmpOp0.getValueType()));
30611
30612   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
30613   if (CC == X86::COND_NE)
30614     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
30615                        DL, OtherVal.getValueType(), OtherVal,
30616                        DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
30617                        NewCmp);
30618   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
30619                      DL, OtherVal.getValueType(), OtherVal,
30620                      DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
30621 }
30622
30623 static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
30624                                 const X86Subtarget &Subtarget) {
30625   SDLoc DL(N);
30626   EVT VT = N->getValueType(0);
30627   SDValue Op0 = N->getOperand(0);
30628   SDValue Op1 = N->getOperand(1);
30629
30630   if (!VT.isVector() || !VT.isSimple() ||
30631       !(VT.getVectorElementType() == MVT::i32))
30632     return SDValue();
30633
30634   unsigned RegSize = 128;
30635   if (Subtarget.hasBWI())
30636     RegSize = 512;
30637   else if (Subtarget.hasAVX2())
30638     RegSize = 256;
30639
30640   // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
30641   if (VT.getSizeInBits() / 4 > RegSize)
30642     return SDValue();
30643
30644   // Detect the following pattern:
30645   //
30646   // 1:    %2 = zext <N x i8> %0 to <N x i32>
30647   // 2:    %3 = zext <N x i8> %1 to <N x i32>
30648   // 3:    %4 = sub nsw <N x i32> %2, %3
30649   // 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30650   // 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
30651   // 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30652   // 7:    %8 = add nsw <N x i32> %7, %vec.phi
30653   //
30654   // The last instruction must be a reduction add. The instructions 3-6 forms an
30655   // ABSDIFF pattern.
30656
30657   // The two operands of reduction add are from PHI and a select-op as in line 7
30658   // above.
30659   SDValue SelectOp, Phi;
30660   if (Op0.getOpcode() == ISD::VSELECT) {
30661     SelectOp = Op0;
30662     Phi = Op1;
30663   } else if (Op1.getOpcode() == ISD::VSELECT) {
30664     SelectOp = Op1;
30665     Phi = Op0;
30666   } else
30667     return SDValue();
30668
30669   // Check the condition of the select instruction is greater-than.
30670   SDValue SetCC = SelectOp->getOperand(0);
30671   if (SetCC.getOpcode() != ISD::SETCC)
30672     return SDValue();
30673   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30674   if (CC != ISD::SETGT)
30675     return SDValue();
30676
30677   Op0 = SelectOp->getOperand(1);
30678   Op1 = SelectOp->getOperand(2);
30679
30680   // The second operand of SelectOp Op1 is the negation of the first operand
30681   // Op0, which is implemented as 0 - Op0.
30682   if (!(Op1.getOpcode() == ISD::SUB &&
30683         ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
30684         Op1.getOperand(1) == Op0))
30685     return SDValue();
30686
30687   // The first operand of SetCC is the first operand of SelectOp, which is the
30688   // difference between two input vectors.
30689   if (SetCC.getOperand(0) != Op0)
30690     return SDValue();
30691
30692   // The second operand of > comparison can be either -1 or 0.
30693   if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30694         ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30695     return SDValue();
30696
30697   // The first operand of SelectOp is the difference between two input vectors.
30698   if (Op0.getOpcode() != ISD::SUB)
30699     return SDValue();
30700
30701   Op1 = Op0.getOperand(1);
30702   Op0 = Op0.getOperand(0);
30703
30704   // Check if the operands of the diff are zero-extended from vectors of i8.
30705   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30706       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30707       Op1.getOpcode() != ISD::ZERO_EXTEND ||
30708       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30709     return SDValue();
30710
30711   // SAD pattern detected. Now build a SAD instruction and an addition for
30712   // reduction. Note that the number of elments of the result of SAD is less
30713   // than the number of elements of its input. Therefore, we could only update
30714   // part of elements in the reduction vector.
30715
30716   // Legalize the type of the inputs of PSADBW.
30717   EVT InVT = Op0.getOperand(0).getValueType();
30718   if (InVT.getSizeInBits() <= 128)
30719     RegSize = 128;
30720   else if (InVT.getSizeInBits() <= 256)
30721     RegSize = 256;
30722
30723   unsigned NumConcat = RegSize / InVT.getSizeInBits();
30724   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30725   Ops[0] = Op0.getOperand(0);
30726   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30727   Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30728   Ops[0] = Op1.getOperand(0);
30729   Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30730
30731   // The output of PSADBW is a vector of i64.
30732   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30733   SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
30734
30735   // We need to turn the vector of i64 into a vector of i32.
30736   // If the reduction vector is at least as wide as the psadbw result, just
30737   // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
30738   // anyway.
30739   MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30740   if (VT.getSizeInBits() >= ResVT.getSizeInBits())
30741     Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
30742   else
30743     Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
30744
30745   if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
30746     // Update part of elements of the reduction vector. This is done by first
30747     // extracting a sub-vector from it, updating this sub-vector, and inserting
30748     // it back.
30749     SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
30750                                  DAG.getIntPtrConstant(0, DL));
30751     SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
30752     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
30753                        DAG.getIntPtrConstant(0, DL));
30754   } else
30755     return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
30756 }
30757
30758 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
30759                           const X86Subtarget &Subtarget) {
30760   const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
30761   if (Flags->hasVectorReduction()) {
30762     if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
30763       return Sad;
30764   }
30765   EVT VT = N->getValueType(0);
30766   SDValue Op0 = N->getOperand(0);
30767   SDValue Op1 = N->getOperand(1);
30768
30769   // Try to synthesize horizontal adds from adds of shuffles.
30770   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30771        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30772       isHorizontalBinOp(Op0, Op1, true))
30773     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
30774
30775   return OptimizeConditionalInDecrement(N, DAG);
30776 }
30777
30778 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
30779                           const X86Subtarget &Subtarget) {
30780   SDValue Op0 = N->getOperand(0);
30781   SDValue Op1 = N->getOperand(1);
30782
30783   // X86 can't encode an immediate LHS of a sub. See if we can push the
30784   // negation into a preceding instruction.
30785   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
30786     // If the RHS of the sub is a XOR with one use and a constant, invert the
30787     // immediate. Then add one to the LHS of the sub so we can turn
30788     // X-Y -> X+~Y+1, saving one register.
30789     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
30790         isa<ConstantSDNode>(Op1.getOperand(1))) {
30791       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
30792       EVT VT = Op0.getValueType();
30793       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
30794                                    Op1.getOperand(0),
30795                                    DAG.getConstant(~XorC, SDLoc(Op1), VT));
30796       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
30797                          DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
30798     }
30799   }
30800
30801   // Try to synthesize horizontal adds from adds of shuffles.
30802   EVT VT = N->getValueType(0);
30803   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30804        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30805       isHorizontalBinOp(Op0, Op1, true))
30806     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
30807
30808   return OptimizeConditionalInDecrement(N, DAG);
30809 }
30810
30811 static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
30812                             TargetLowering::DAGCombinerInfo &DCI,
30813                             const X86Subtarget &Subtarget) {
30814   SDLoc DL(N);
30815   MVT VT = N->getSimpleValueType(0);
30816   MVT SVT = VT.getVectorElementType();
30817   SDValue Op = N->getOperand(0);
30818   MVT OpVT = Op.getSimpleValueType();
30819   MVT OpEltVT = OpVT.getVectorElementType();
30820   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
30821
30822   // Perform any constant folding.
30823   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
30824     SmallVector<SDValue, 4> Vals;
30825     for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30826       SDValue OpElt = Op.getOperand(i);
30827       if (OpElt.getOpcode() == ISD::UNDEF) {
30828         Vals.push_back(DAG.getUNDEF(SVT));
30829         continue;
30830       }
30831       APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
30832       assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
30833       Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
30834       Vals.push_back(DAG.getConstant(Cst, DL, SVT));
30835     }
30836     return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
30837   }
30838
30839   // (vzext (bitcast (vzext (x)) -> (vzext x)
30840   SDValue V = peekThroughBitcasts(Op);
30841   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
30842     MVT InnerVT = V.getSimpleValueType();
30843     MVT InnerEltVT = InnerVT.getVectorElementType();
30844
30845     // If the element sizes match exactly, we can just do one larger vzext. This
30846     // is always an exact type match as vzext operates on integer types.
30847     if (OpEltVT == InnerEltVT) {
30848       assert(OpVT == InnerVT && "Types must match for vzext!");
30849       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
30850     }
30851
30852     // The only other way we can combine them is if only a single element of the
30853     // inner vzext is used in the input to the outer vzext.
30854     if (InnerEltVT.getSizeInBits() < InputBits)
30855       return SDValue();
30856
30857     // In this case, the inner vzext is completely dead because we're going to
30858     // only look at bits inside of the low element. Just do the outer vzext on
30859     // a bitcast of the input to the inner.
30860     return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
30861   }
30862
30863   // Check if we can bypass extracting and re-inserting an element of an input
30864   // vector. Essentially:
30865   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
30866   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
30867       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
30868       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
30869     SDValue ExtractedV = V.getOperand(0);
30870     SDValue OrigV = ExtractedV.getOperand(0);
30871     if (isNullConstant(ExtractedV.getOperand(1))) {
30872         MVT OrigVT = OrigV.getSimpleValueType();
30873         // Extract a subvector if necessary...
30874         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
30875           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
30876           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
30877                                     OrigVT.getVectorNumElements() / Ratio);
30878           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
30879                               DAG.getIntPtrConstant(0, DL));
30880         }
30881         Op = DAG.getBitcast(OpVT, OrigV);
30882         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
30883       }
30884   }
30885
30886   return SDValue();
30887 }
30888
30889 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
30890 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
30891                                   const X86Subtarget &Subtarget) {
30892   SDValue Chain = N->getOperand(0);
30893   SDValue LHS = N->getOperand(1);
30894   SDValue RHS = N->getOperand(2);
30895   MVT VT = RHS.getSimpleValueType();
30896   SDLoc DL(N);
30897
30898   auto *C = dyn_cast<ConstantSDNode>(RHS);
30899   if (!C || C->getZExtValue() != 1)
30900     return SDValue();
30901
30902   RHS = DAG.getConstant(-1, DL, VT);
30903   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
30904   return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
30905                                  DAG.getVTList(MVT::i32, MVT::Other),
30906                                  {Chain, LHS, RHS}, VT, MMO);
30907 }
30908
30909 // TEST (AND a, b) ,(AND a, b) -> TEST a, b
30910 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
30911   SDValue Op0 = N->getOperand(0);
30912   SDValue Op1 = N->getOperand(1);
30913
30914   if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
30915     return SDValue();
30916
30917   EVT VT = N->getValueType(0);
30918   SDLoc DL(N);
30919
30920   return DAG.getNode(X86ISD::TESTM, DL, VT,
30921                      Op0->getOperand(0), Op0->getOperand(1));
30922 }
30923
30924 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
30925                                     const X86Subtarget &Subtarget) {
30926   MVT VT = N->getSimpleValueType(0);
30927   SDLoc DL(N);
30928
30929   if (N->getOperand(0) == N->getOperand(1)) {
30930     if (N->getOpcode() == X86ISD::PCMPEQ)
30931       return getOnesVector(VT, Subtarget, DAG, DL);
30932     if (N->getOpcode() == X86ISD::PCMPGT)
30933       return getZeroVector(VT, Subtarget, DAG, DL);
30934   }
30935
30936   return SDValue();
30937 }
30938
30939
30940 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
30941                                              DAGCombinerInfo &DCI) const {
30942   SelectionDAG &DAG = DCI.DAG;
30943   switch (N->getOpcode()) {
30944   default: break;
30945   case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
30946   case ISD::VSELECT:
30947   case ISD::SELECT:
30948   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
30949   case ISD::BITCAST:        return combineBitcast(N, DAG, Subtarget);
30950   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
30951   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
30952   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
30953   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
30954   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
30955   case ISD::SHL:
30956   case ISD::SRA:
30957   case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
30958   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
30959   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
30960   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
30961   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
30962   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
30963   case ISD::STORE:          return combineStore(N, DAG, Subtarget);
30964   case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
30965   case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
30966   case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
30967   case ISD::FADD:
30968   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
30969   case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
30970   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
30971   case X86ISD::FXOR:
30972   case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
30973   case X86ISD::FMIN:
30974   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
30975   case ISD::FMINNUM:
30976   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
30977   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
30978   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
30979   case X86ISD::BT:          return combineBT(N, DAG, DCI);
30980   case X86ISD::VZEXT_MOVL:  return combineVZextMovl(N, DAG);
30981   case ISD::ANY_EXTEND:
30982   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
30983   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
30984   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
30985   case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
30986   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, DCI, Subtarget);
30987   case X86ISD::BRCOND:      return combineBrCond(N, DAG, DCI, Subtarget);
30988   case X86ISD::VZEXT:       return combineVZext(N, DAG, DCI, Subtarget);
30989   case X86ISD::SHUFP:       // Handle all target specific shuffles
30990   case X86ISD::INSERTPS:
30991   case X86ISD::PALIGNR:
30992   case X86ISD::VSHLDQ:
30993   case X86ISD::VSRLDQ:
30994   case X86ISD::BLENDI:
30995   case X86ISD::UNPCKH:
30996   case X86ISD::UNPCKL:
30997   case X86ISD::MOVHLPS:
30998   case X86ISD::MOVLHPS:
30999   case X86ISD::PSHUFB:
31000   case X86ISD::PSHUFD:
31001   case X86ISD::PSHUFHW:
31002   case X86ISD::PSHUFLW:
31003   case X86ISD::MOVSHDUP:
31004   case X86ISD::MOVSLDUP:
31005   case X86ISD::MOVDDUP:
31006   case X86ISD::MOVSS:
31007   case X86ISD::MOVSD:
31008   case X86ISD::VPPERM:
31009   case X86ISD::VPERMI:
31010   case X86ISD::VPERMV:
31011   case X86ISD::VPERMV3:
31012   case X86ISD::VPERMIL2:
31013   case X86ISD::VPERMILPI:
31014   case X86ISD::VPERMILPV:
31015   case X86ISD::VPERM2X128:
31016   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
31017   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
31018   case ISD::MGATHER:
31019   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
31020   case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
31021   case X86ISD::TESTM:       return combineTestM(N, DAG);
31022   case X86ISD::PCMPEQ:
31023   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
31024   }
31025
31026   return SDValue();
31027 }
31028
31029 /// Return true if the target has native support for the specified value type
31030 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
31031 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
31032 /// some i16 instructions are slow.
31033 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
31034   if (!isTypeLegal(VT))
31035     return false;
31036   if (VT != MVT::i16)
31037     return true;
31038
31039   switch (Opc) {
31040   default:
31041     return true;
31042   case ISD::LOAD:
31043   case ISD::SIGN_EXTEND:
31044   case ISD::ZERO_EXTEND:
31045   case ISD::ANY_EXTEND:
31046   case ISD::SHL:
31047   case ISD::SRL:
31048   case ISD::SUB:
31049   case ISD::ADD:
31050   case ISD::MUL:
31051   case ISD::AND:
31052   case ISD::OR:
31053   case ISD::XOR:
31054     return false;
31055   }
31056 }
31057
31058 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
31059 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
31060 /// we don't adjust the stack we clobber the first frame index.
31061 /// See X86InstrInfo::copyPhysReg.
31062 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
31063     MachineFunction *MF) const {
31064   const MachineRegisterInfo &MRI = MF->getRegInfo();
31065
31066   return any_of(MRI.reg_instructions(X86::EFLAGS),
31067                 [](const MachineInstr &RI) { return RI.isCopy(); });
31068 }
31069
31070 /// This method query the target whether it is beneficial for dag combiner to
31071 /// promote the specified node. If true, it should return the desired promotion
31072 /// type by reference.
31073 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
31074   EVT VT = Op.getValueType();
31075   if (VT != MVT::i16)
31076     return false;
31077
31078   bool Promote = false;
31079   bool Commute = false;
31080   switch (Op.getOpcode()) {
31081   default: break;
31082   case ISD::SIGN_EXTEND:
31083   case ISD::ZERO_EXTEND:
31084   case ISD::ANY_EXTEND:
31085     Promote = true;
31086     break;
31087   case ISD::SHL:
31088   case ISD::SRL: {
31089     SDValue N0 = Op.getOperand(0);
31090     // Look out for (store (shl (load), x)).
31091     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
31092       return false;
31093     Promote = true;
31094     break;
31095   }
31096   case ISD::ADD:
31097   case ISD::MUL:
31098   case ISD::AND:
31099   case ISD::OR:
31100   case ISD::XOR:
31101     Commute = true;
31102     // fallthrough
31103   case ISD::SUB: {
31104     SDValue N0 = Op.getOperand(0);
31105     SDValue N1 = Op.getOperand(1);
31106     if (!Commute && MayFoldLoad(N1))
31107       return false;
31108     // Avoid disabling potential load folding opportunities.
31109     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
31110       return false;
31111     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
31112       return false;
31113     Promote = true;
31114   }
31115   }
31116
31117   PVT = MVT::i32;
31118   return Promote;
31119 }
31120
31121 //===----------------------------------------------------------------------===//
31122 //                           X86 Inline Assembly Support
31123 //===----------------------------------------------------------------------===//
31124
31125 // Helper to match a string separated by whitespace.
31126 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
31127   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
31128
31129   for (StringRef Piece : Pieces) {
31130     if (!S.startswith(Piece)) // Check if the piece matches.
31131       return false;
31132
31133     S = S.substr(Piece.size());
31134     StringRef::size_type Pos = S.find_first_not_of(" \t");
31135     if (Pos == 0) // We matched a prefix.
31136       return false;
31137
31138     S = S.substr(Pos);
31139   }
31140
31141   return S.empty();
31142 }
31143
31144 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
31145
31146   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
31147     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
31148         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
31149         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
31150
31151       if (AsmPieces.size() == 3)
31152         return true;
31153       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
31154         return true;
31155     }
31156   }
31157   return false;
31158 }
31159
31160 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
31161   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
31162
31163   const std::string &AsmStr = IA->getAsmString();
31164
31165   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
31166   if (!Ty || Ty->getBitWidth() % 16 != 0)
31167     return false;
31168
31169   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
31170   SmallVector<StringRef, 4> AsmPieces;
31171   SplitString(AsmStr, AsmPieces, ";\n");
31172
31173   switch (AsmPieces.size()) {
31174   default: return false;
31175   case 1:
31176     // FIXME: this should verify that we are targeting a 486 or better.  If not,
31177     // we will turn this bswap into something that will be lowered to logical
31178     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
31179     // lower so don't worry about this.
31180     // bswap $0
31181     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
31182         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
31183         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
31184         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
31185         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
31186         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
31187       // No need to check constraints, nothing other than the equivalent of
31188       // "=r,0" would be valid here.
31189       return IntrinsicLowering::LowerToByteSwap(CI);
31190     }
31191
31192     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
31193     if (CI->getType()->isIntegerTy(16) &&
31194         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31195         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
31196          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
31197       AsmPieces.clear();
31198       StringRef ConstraintsStr = IA->getConstraintString();
31199       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31200       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31201       if (clobbersFlagRegisters(AsmPieces))
31202         return IntrinsicLowering::LowerToByteSwap(CI);
31203     }
31204     break;
31205   case 3:
31206     if (CI->getType()->isIntegerTy(32) &&
31207         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31208         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
31209         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
31210         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
31211       AsmPieces.clear();
31212       StringRef ConstraintsStr = IA->getConstraintString();
31213       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31214       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31215       if (clobbersFlagRegisters(AsmPieces))
31216         return IntrinsicLowering::LowerToByteSwap(CI);
31217     }
31218
31219     if (CI->getType()->isIntegerTy(64)) {
31220       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
31221       if (Constraints.size() >= 2 &&
31222           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
31223           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
31224         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
31225         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
31226             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
31227             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
31228           return IntrinsicLowering::LowerToByteSwap(CI);
31229       }
31230     }
31231     break;
31232   }
31233   return false;
31234 }
31235
31236 /// Given a constraint letter, return the type of constraint for this target.
31237 X86TargetLowering::ConstraintType
31238 X86TargetLowering::getConstraintType(StringRef Constraint) const {
31239   if (Constraint.size() == 1) {
31240     switch (Constraint[0]) {
31241     case 'R':
31242     case 'q':
31243     case 'Q':
31244     case 'f':
31245     case 't':
31246     case 'u':
31247     case 'y':
31248     case 'x':
31249     case 'Y':
31250     case 'l':
31251       return C_RegisterClass;
31252     case 'a':
31253     case 'b':
31254     case 'c':
31255     case 'd':
31256     case 'S':
31257     case 'D':
31258     case 'A':
31259       return C_Register;
31260     case 'I':
31261     case 'J':
31262     case 'K':
31263     case 'L':
31264     case 'M':
31265     case 'N':
31266     case 'G':
31267     case 'C':
31268     case 'e':
31269     case 'Z':
31270       return C_Other;
31271     default:
31272       break;
31273     }
31274   }
31275   return TargetLowering::getConstraintType(Constraint);
31276 }
31277
31278 /// Examine constraint type and operand type and determine a weight value.
31279 /// This object must already have been set up with the operand type
31280 /// and the current alternative constraint selected.
31281 TargetLowering::ConstraintWeight
31282   X86TargetLowering::getSingleConstraintMatchWeight(
31283     AsmOperandInfo &info, const char *constraint) const {
31284   ConstraintWeight weight = CW_Invalid;
31285   Value *CallOperandVal = info.CallOperandVal;
31286     // If we don't have a value, we can't do a match,
31287     // but allow it at the lowest weight.
31288   if (!CallOperandVal)
31289     return CW_Default;
31290   Type *type = CallOperandVal->getType();
31291   // Look at the constraint type.
31292   switch (*constraint) {
31293   default:
31294     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
31295   case 'R':
31296   case 'q':
31297   case 'Q':
31298   case 'a':
31299   case 'b':
31300   case 'c':
31301   case 'd':
31302   case 'S':
31303   case 'D':
31304   case 'A':
31305     if (CallOperandVal->getType()->isIntegerTy())
31306       weight = CW_SpecificReg;
31307     break;
31308   case 'f':
31309   case 't':
31310   case 'u':
31311     if (type->isFloatingPointTy())
31312       weight = CW_SpecificReg;
31313     break;
31314   case 'y':
31315     if (type->isX86_MMXTy() && Subtarget.hasMMX())
31316       weight = CW_SpecificReg;
31317     break;
31318   case 'x':
31319   case 'Y':
31320     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
31321         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
31322       weight = CW_Register;
31323     break;
31324   case 'I':
31325     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
31326       if (C->getZExtValue() <= 31)
31327         weight = CW_Constant;
31328     }
31329     break;
31330   case 'J':
31331     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31332       if (C->getZExtValue() <= 63)
31333         weight = CW_Constant;
31334     }
31335     break;
31336   case 'K':
31337     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31338       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
31339         weight = CW_Constant;
31340     }
31341     break;
31342   case 'L':
31343     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31344       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
31345         weight = CW_Constant;
31346     }
31347     break;
31348   case 'M':
31349     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31350       if (C->getZExtValue() <= 3)
31351         weight = CW_Constant;
31352     }
31353     break;
31354   case 'N':
31355     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31356       if (C->getZExtValue() <= 0xff)
31357         weight = CW_Constant;
31358     }
31359     break;
31360   case 'G':
31361   case 'C':
31362     if (isa<ConstantFP>(CallOperandVal)) {
31363       weight = CW_Constant;
31364     }
31365     break;
31366   case 'e':
31367     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31368       if ((C->getSExtValue() >= -0x80000000LL) &&
31369           (C->getSExtValue() <= 0x7fffffffLL))
31370         weight = CW_Constant;
31371     }
31372     break;
31373   case 'Z':
31374     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31375       if (C->getZExtValue() <= 0xffffffff)
31376         weight = CW_Constant;
31377     }
31378     break;
31379   }
31380   return weight;
31381 }
31382
31383 /// Try to replace an X constraint, which matches anything, with another that
31384 /// has more specific requirements based on the type of the corresponding
31385 /// operand.
31386 const char *X86TargetLowering::
31387 LowerXConstraint(EVT ConstraintVT) const {
31388   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
31389   // 'f' like normal targets.
31390   if (ConstraintVT.isFloatingPoint()) {
31391     if (Subtarget.hasSSE2())
31392       return "Y";
31393     if (Subtarget.hasSSE1())
31394       return "x";
31395   }
31396
31397   return TargetLowering::LowerXConstraint(ConstraintVT);
31398 }
31399
31400 /// Lower the specified operand into the Ops vector.
31401 /// If it is invalid, don't add anything to Ops.
31402 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
31403                                                      std::string &Constraint,
31404                                                      std::vector<SDValue>&Ops,
31405                                                      SelectionDAG &DAG) const {
31406   SDValue Result;
31407
31408   // Only support length 1 constraints for now.
31409   if (Constraint.length() > 1) return;
31410
31411   char ConstraintLetter = Constraint[0];
31412   switch (ConstraintLetter) {
31413   default: break;
31414   case 'I':
31415     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31416       if (C->getZExtValue() <= 31) {
31417         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31418                                        Op.getValueType());
31419         break;
31420       }
31421     }
31422     return;
31423   case 'J':
31424     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31425       if (C->getZExtValue() <= 63) {
31426         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31427                                        Op.getValueType());
31428         break;
31429       }
31430     }
31431     return;
31432   case 'K':
31433     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31434       if (isInt<8>(C->getSExtValue())) {
31435         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31436                                        Op.getValueType());
31437         break;
31438       }
31439     }
31440     return;
31441   case 'L':
31442     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31443       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
31444           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
31445         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
31446                                        Op.getValueType());
31447         break;
31448       }
31449     }
31450     return;
31451   case 'M':
31452     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31453       if (C->getZExtValue() <= 3) {
31454         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31455                                        Op.getValueType());
31456         break;
31457       }
31458     }
31459     return;
31460   case 'N':
31461     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31462       if (C->getZExtValue() <= 255) {
31463         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31464                                        Op.getValueType());
31465         break;
31466       }
31467     }
31468     return;
31469   case 'O':
31470     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31471       if (C->getZExtValue() <= 127) {
31472         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31473                                        Op.getValueType());
31474         break;
31475       }
31476     }
31477     return;
31478   case 'e': {
31479     // 32-bit signed value
31480     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31481       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31482                                            C->getSExtValue())) {
31483         // Widen to 64 bits here to get it sign extended.
31484         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
31485         break;
31486       }
31487     // FIXME gcc accepts some relocatable values here too, but only in certain
31488     // memory models; it's complicated.
31489     }
31490     return;
31491   }
31492   case 'Z': {
31493     // 32-bit unsigned value
31494     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31495       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31496                                            C->getZExtValue())) {
31497         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31498                                        Op.getValueType());
31499         break;
31500       }
31501     }
31502     // FIXME gcc accepts some relocatable values here too, but only in certain
31503     // memory models; it's complicated.
31504     return;
31505   }
31506   case 'i': {
31507     // Literal immediates are always ok.
31508     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
31509       // Widen to 64 bits here to get it sign extended.
31510       Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
31511       break;
31512     }
31513
31514     // In any sort of PIC mode addresses need to be computed at runtime by
31515     // adding in a register or some sort of table lookup.  These can't
31516     // be used as immediates.
31517     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
31518       return;
31519
31520     // If we are in non-pic codegen mode, we allow the address of a global (with
31521     // an optional displacement) to be used with 'i'.
31522     GlobalAddressSDNode *GA = nullptr;
31523     int64_t Offset = 0;
31524
31525     // Match either (GA), (GA+C), (GA+C1+C2), etc.
31526     while (1) {
31527       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
31528         Offset += GA->getOffset();
31529         break;
31530       } else if (Op.getOpcode() == ISD::ADD) {
31531         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31532           Offset += C->getZExtValue();
31533           Op = Op.getOperand(0);
31534           continue;
31535         }
31536       } else if (Op.getOpcode() == ISD::SUB) {
31537         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31538           Offset += -C->getZExtValue();
31539           Op = Op.getOperand(0);
31540           continue;
31541         }
31542       }
31543
31544       // Otherwise, this isn't something we can handle, reject it.
31545       return;
31546     }
31547
31548     const GlobalValue *GV = GA->getGlobal();
31549     // If we require an extra load to get this address, as in PIC mode, we
31550     // can't accept it.
31551     if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
31552       return;
31553
31554     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
31555                                         GA->getValueType(0), Offset);
31556     break;
31557   }
31558   }
31559
31560   if (Result.getNode()) {
31561     Ops.push_back(Result);
31562     return;
31563   }
31564   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
31565 }
31566
31567 /// Check if \p RC is a general purpose register class.
31568 /// I.e., GR* or one of their variant.
31569 static bool isGRClass(const TargetRegisterClass &RC) {
31570   switch (RC.getID()) {
31571   case X86::GR8RegClassID:
31572   case X86::GR8_ABCD_LRegClassID:
31573   case X86::GR8_ABCD_HRegClassID:
31574   case X86::GR8_NOREXRegClassID:
31575   case X86::GR16RegClassID:
31576   case X86::GR16_ABCDRegClassID:
31577   case X86::GR16_NOREXRegClassID:
31578   case X86::GR32RegClassID:
31579   case X86::GR32_ABCDRegClassID:
31580   case X86::GR32_TCRegClassID:
31581   case X86::GR32_NOREXRegClassID:
31582   case X86::GR32_NOAXRegClassID:
31583   case X86::GR32_NOSPRegClassID:
31584   case X86::GR32_NOREX_NOSPRegClassID:
31585   case X86::GR32_ADRegClassID:
31586   case X86::GR64RegClassID:
31587   case X86::GR64_ABCDRegClassID:
31588   case X86::GR64_TCRegClassID:
31589   case X86::GR64_TCW64RegClassID:
31590   case X86::GR64_NOREXRegClassID:
31591   case X86::GR64_NOSPRegClassID:
31592   case X86::GR64_NOREX_NOSPRegClassID:
31593   case X86::LOW32_ADDR_ACCESSRegClassID:
31594   case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
31595     return true;
31596   default:
31597     return false;
31598   }
31599 }
31600
31601 /// Check if \p RC is a vector register class.
31602 /// I.e., FR* / VR* or one of their variant.
31603 static bool isFRClass(const TargetRegisterClass &RC) {
31604   switch (RC.getID()) {
31605   case X86::FR32RegClassID:
31606   case X86::FR32XRegClassID:
31607   case X86::FR64RegClassID:
31608   case X86::FR64XRegClassID:
31609   case X86::FR128RegClassID:
31610   case X86::VR64RegClassID:
31611   case X86::VR128RegClassID:
31612   case X86::VR128LRegClassID:
31613   case X86::VR128HRegClassID:
31614   case X86::VR128XRegClassID:
31615   case X86::VR256RegClassID:
31616   case X86::VR256LRegClassID:
31617   case X86::VR256HRegClassID:
31618   case X86::VR256XRegClassID:
31619   case X86::VR512RegClassID:
31620     return true;
31621   default:
31622     return false;
31623   }
31624 }
31625
/// Resolve the inline-asm constraint \p Constraint for an operand of type
/// \p VT into a (register, register class) pair.
///
/// The lookup happens in three stages:
///  1. Single-letter GCC constraints that correspond directly to a register
///     class are answered here, with 0 as the register.
///  2. Everything else is handed to the generic TargetLowering
///     implementation; if that finds no class, a few X86-specific spellings
///     ("{st(N)}", "{st}", "{flags}" and "A") are recognised.
///  3. If the generic code found a register whose class does not hold \p VT,
///     the result is adjusted to a register/class of the right size, or
///     cleared (0, nullptr) when no suitable one exists.
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
      // TODO: Slight differences here in allocation order and leaving
      // RIP in the class. Do they matter any more here than they do
      // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        // Any other type is left to the generic code below.
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      // On 32-bit subtargets every remaining type is given a 32-bit register.
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      // On 32-bit subtargets every remaining type is given a 32-bit register.
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':  // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;

      // Choose the class from the operand type; types not listed here are
      // left to the generic code below.
      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      // 512-bit vector types.
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map "{st(0)}" .. "{st(7)}" (case-insensitive "st") to FP0 plus the
    // digit, in the RFP80 class.
    // NOTE(review): this relies on X86::FP0..FP7 being consecutive values.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::FP0+Constraint[4]-'0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    // Nothing matched: return the generic code's (classless) result as is.
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (Res.second->hasType(VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    // A 1-bit value is placed in an 8-bit register.
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      // Any size other than 8, 16 or 32 selects GR64 here.
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // Only the class is replaced here; the register in Res.first is kept.
    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error;
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  // Classes that are neither GR nor FR are returned unchanged.
  return Res;
}
31833
31834 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
31835                                             const AddrMode &AM, Type *Ty,
31836                                             unsigned AS) const {
31837   // Scaling factors are not free at all.
31838   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
31839   // will take 2 allocations in the out of order engine instead of 1
31840   // for plain addressing mode, i.e. inst (reg1).
31841   // E.g.,
31842   // vaddps (%rsi,%drx), %ymm0, %ymm1
31843   // Requires two allocations (one for the load, one for the computation)
31844   // whereas:
31845   // vaddps (%rsi), %ymm0, %ymm1
31846   // Requires just 1 allocation, i.e., freeing allocations for other operations
31847   // and having less micro operations to execute.
31848   //
31849   // For some X86 architectures, this is even worse because for instance for
31850   // stores, the complex addressing mode forces the instruction to use the
31851   // "load" ports instead of the dedicated "store" port.
31852   // E.g., on Haswell:
31853   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
31854   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
31855   if (isLegalAddressingMode(DL, AM, Ty, AS))
31856     // Scale represents reg2 * scale, thus account for 1
31857     // as soon as we use a second register.
31858     return AM.Scale != 0;
31859   return -1;
31860 }
31861
31862 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
31863   // Integer division on x86 is expensive. However, when aggressively optimizing
31864   // for code size, we prefer to use a div instruction, as it is usually smaller
31865   // than the alternative sequence.
31866   // The exception to this is vector division. Since x86 doesn't have vector
31867   // integer division, leaving the division as-is is a loss even in terms of
31868   // size, because it will have to be scalarized, while the alternative code
31869   // sequence can be performed in vector form.
31870   bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
31871                                    Attribute::MinSize);
31872   return OptSize && !VT.isVector();
31873 }
31874
31875 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
31876   if (!Subtarget.is64Bit())
31877     return;
31878
31879   // Update IsSplitCSR in X86MachineFunctionInfo.
31880   X86MachineFunctionInfo *AFI =
31881     Entry->getParent()->getInfo<X86MachineFunctionInfo>();
31882   AFI->setIsSplitCSR(true);
31883 }
31884
31885 void X86TargetLowering::insertCopiesSplitCSR(
31886     MachineBasicBlock *Entry,
31887     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
31888   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
31889   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
31890   if (!IStart)
31891     return;
31892
31893   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31894   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
31895   MachineBasicBlock::iterator MBBI = Entry->begin();
31896   for (const MCPhysReg *I = IStart; *I; ++I) {
31897     const TargetRegisterClass *RC = nullptr;
31898     if (X86::GR64RegClass.contains(*I))
31899       RC = &X86::GR64RegClass;
31900     else
31901       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
31902
31903     unsigned NewVR = MRI->createVirtualRegister(RC);
31904     // Create copy from CSR to a virtual register.
31905     // FIXME: this currently does not emit CFI pseudo-instructions, it works
31906     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
31907     // nounwind. If we want to generalize this later, we may need to emit
31908     // CFI pseudo-instructions.
31909     assert(Entry->getParent()->getFunction()->hasFnAttribute(
31910                Attribute::NoUnwind) &&
31911            "Function should be nounwind in insertCopiesSplitCSR!");
31912     Entry->addLiveIn(*I);
31913     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
31914         .addReg(*I);
31915
31916     // Insert the copy-back instructions right before the terminator.
31917     for (auto *Exit : Exits)
31918       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
31919               TII->get(TargetOpcode::COPY), *I)
31920           .addReg(NewVR);
31921   }
31922 }