//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             StringRef Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
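  // Illustrative sketch (not part of the configuration above): with
  // addBypassSlowDiv(64, 32), the IR-level BypassSlowDivision pass inserts a
  // run-time check so that a 64-bit divide whose operands both fit in 32 bits
  // is done with the much cheaper 32-bit DIV:
  //
  //   if (((a | b) >> 32) == 0)
  //     q = (uint32_t)a / (uint32_t)b;   // fast 32-bit divide
  //   else
  //     q = a / b;                       // slow 64-bit divide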
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
    setOperationAction(ShiftOp, MVT::i16, Custom);
    setOperationAction(ShiftOp, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ShiftOp, MVT::i64, Custom);
  }

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
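    // Illustrative sketch of the i64 path mentioned above: do a signed 64-bit
    // FILD and, if the original value had its sign bit set, conditionally add
    // 2^64 to correct the result:
    //   double d = (double)(int64_t)x;
    //   if ((int64_t)x < 0) d += 18446744073709551616.0;  // + 2^64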
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
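  // For illustration (not configuration): given
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // both operations are exposed through the two-result divide node, CSE merges
  // them into one node, and that single node maps onto one IDIV whose two
  // outputs supply the quotient and the remainder.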
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
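  // For reference (an assumption about the default runtime-library names, not
  // something configured here): expanding FP16_TO_FP / FP_TO_FP16 ends up
  // calling the __gnu_h2f_ieee / __gnu_f2h_ieee conversion helpers when F16C's
  // VCVTPH2PS / VCVTPS2PH instructions are unavailable.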
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
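  // Illustrative note: on 32-bit x86 a 64-bit shift is expanded through the
  // *_PARTS nodes above, which typically lower to an SHLD/SHRD double-shift
  // pair plus a compare/cmov (or branch) to handle shift amounts >= 32.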
  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // These might be better off as horizontal vector ops.
      setOperationAction(ISD::FADD, VT, Custom);
      setOperationAction(ISD::FSUB, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

  } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
             (UseX87 || Is64Bit)) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    if (UseX87)
      addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    if (UseX87)
      setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    if (UseX87)
      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    if (UseX87) {
      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
  }

  // Expand FP32 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f32)) {
    if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
      addLegalFPImmediate(APFloat(+0.0f)); // FLD0
      addLegalFPImmediate(APFloat(+1.0f)); // FLD1
      addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0f)); // xorps
  }
  // Expand FP64 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f64)) {
    if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
      addLegalFPImmediate(APFloat(+0.0)); // FLD0
      addLegalFPImmediate(APFloat(+1.0)); // FLD1
      addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0)); // xorpd
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                     : &X86::VR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f80, Expand);
    setOperationAction(ISD::FCOS, MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // it alone for now.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::SREM, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::UREM, VT, Custom);
    }
    setOperationAction(ISD::MUL, MVT::v2i8, Custom);
    setOperationAction(ISD::MUL, MVT::v2i16, Custom);
    setOperationAction(ISD::MUL, MVT::v2i32, Custom);
    setOperationAction(ISD::MUL, MVT::v4i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v8i8, Custom);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }

    setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
    if (!ExperimentalVectorWideningLegalization) {
      // Use widening instead of promotion.
      for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
                       MVT::v4i16, MVT::v2i16 }) {
        setOperationAction(ISD::UADDSAT, VT, Custom);
        setOperationAction(ISD::SADDSAT, VT, Custom);
        setOperationAction(ISD::USUBSAT, VT, Custom);
        setOperationAction(ISD::SSUBSAT, VT, Custom);
      }
    }

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Provide custom widening for v2f32 setcc. This is really for VLX when
    // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
    // type legalization changing the result type to v4i1 during widening.
    // It works fine for SSE2 and is probably faster so no need to qualify with
    // SSE41 support.
    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
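    // For example (illustrative, the actual lowering lives elsewhere in this
    // file): a sign-extending load of v2i8 can be done as one i16 scalar load
    // placed into an XMM register and then sign-extended in-register to the
    // legal 128-bit result type.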
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v16i8, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);

    // Custom legalize these to avoid over promotion or custom promotion.
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);

    // Marking FP_TO_SINT v8i16 as Custom tricks type legalization into
    // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
    // split again based on the input type, this will cause an AssertSExt i16
    // to be emitted instead of an AssertZExt. This allows packssdw followed by
    // packuswb to be used to truncate to v8i8. This is necessary since
    // packusdw isn't available until SSE4.1.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
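    // Illustrative sketch of the SSE2-only sequence this enables (assuming the
    // wide FP input is split into two 128-bit halves):
    //   cvttps2dq   ; two v4f32 -> v4i32 conversions
    //   packssdw    ; i32 -> i16 narrowing with signed saturation
    //   packuswb    ; i16 -> i8 narrowing with unsigned saturation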
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    // We want to legalize this to an f64 load rather than an i64 load on
    // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
    // stores.
    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
    setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
    setOperationAction(ISD::STORE, MVT::v2f32, Custom);
    setOperationAction(ISD::STORE, MVT::v2i32, Custom);
    setOperationAction(ISD::STORE, MVT::v4i16, Custom);
    setOperationAction(ISD::STORE, MVT::v8i8, Custom);
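    // Illustrative: a <2 x float> load then becomes a single 64-bit f64
    // (MOVSD-style) load whose value is reused as the low half of the wider
    // vector, rather than an i64 integer load or two separate 32-bit loads.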
    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    if (ExperimentalVectorWideningLegalization) {
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);

      setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
      setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
      setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
      setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
      setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
      setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);

      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
    }

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
    setOperationAction(ISD::ROTL, MVT::v8i16, Custom);

    // With AVX512, expanding (and promoting the shifts) is better.
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);

    // These might be better off as horizontal vector ops.
    setOperationAction(ISD::ADD, MVT::i16, Custom);
    setOperationAction(ISD::ADD, MVT::i32, Custom);
    setOperationAction(ISD::SUB, MVT::i16, Custom);
    setOperationAction(ISD::SUB, MVT::i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    if (!ExperimentalVectorWideningLegalization) {
      // Avoid narrow result types when widening. The legal types are listed
      // in the next loop.
      for (MVT VT : MVT::integer_vector_valuetypes()) {
        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
      }
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      if (!ExperimentalVectorWideningLegalization)
        setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
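    // i.e. a v8f32 -> v8i16 fp_to_int is first performed as v8f32 -> v8i32
    // (one VCVTTPS2DQ) and the i32 lanes are then truncated down to i16.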
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    if (ExperimentalVectorWideningLegalization) {
      // These types need custom splitting if their input is a 128-bit vector.
      setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    }

    setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
    setOperationAction(ISD::ROTL, MVT::v16i16, Custom);

    // With BWI, expanding (and promoting the shifts) is better.
    if (!Subtarget.hasBWI())
      setOperationAction(ISD::ROTL, MVT::v32i8, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
    setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);

      // TODO - remove this once 256-bit X86ISD::ANDNP is correctly split.
      setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }
    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
    setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    setOperationAction(ISD::ABS, MVT::v4i64, Custom);
    setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i64, Custom);

    setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
    }

    if (HasInt256) {
      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Custom legalize 2x32 to get a little better code.
    setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
    setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
      setOperationAction(ISD::MGATHER, VT, Custom);
  }
  // This block controls legalization of the mask vector sizes that are
  // available with AVX512. 512-bit vectors are in a separate block controlled
  // by useAVX512Regs.
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);

    // There is no byte sized k-register load or store without AVX512DQ.
    if (!Subtarget.hasDQI()) {
      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);

      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
    }

    // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }
1342 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1343 setOperationAction(ISD::ADD, VT, Custom);
1344 setOperationAction(ISD::SUB, VT, Custom);
1345 setOperationAction(ISD::MUL, VT, Custom);
1346 setOperationAction(ISD::SETCC, VT, Custom);
1347 setOperationAction(ISD::SELECT, VT, Custom);
1348 setOperationAction(ISD::TRUNCATE, VT, Custom);
1349 setOperationAction(ISD::UADDSAT, VT, Custom);
1350 setOperationAction(ISD::SADDSAT, VT, Custom);
1351 setOperationAction(ISD::USUBSAT, VT, Custom);
1352 setOperationAction(ISD::SSUBSAT, VT, Custom);
1354 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1355 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1356 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1357 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1358 setOperationAction(ISD::VSELECT, VT, Expand);
1361 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1362 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1363 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1364 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
1365 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1366 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1367 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1368 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1369 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1372 // This block controls legalization for 512-bit operations with 32/64 bit
1373 // elements. 512-bits can be disabled based on prefer-vector-width and
1374 // required-vector-width function attributes.
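// Note (editorial): useAVX512Regs() roughly means "512-bit registers are
// allowed for this function"; e.g. a function carrying
// "prefer-vector-width"="256" will normally skip this block unless its
// required vector width forces 512 bits.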
1375 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1376 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1377 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1378 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1379 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1381 for (MVT VT : MVT::fp_vector_valuetypes())
1382 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1384 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1385 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1386 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1387 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1388 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1389 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1392 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1393 setOperationAction(ISD::FNEG, VT, Custom);
1394 setOperationAction(ISD::FABS, VT, Custom);
1395 setOperationAction(ISD::FMA, VT, Legal);
1396 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1399 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1400 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
1401 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
1402 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
1403 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1404 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
1405 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
1406 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
1407 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1408 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1410 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1411 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1412 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1413 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1414 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1416 if (!Subtarget.hasVLX()) {
1417 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1418 // to 512-bit rather than use the AVX2 instructions so that we can use
1419 // the mask (k) registers.
1420 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1421 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1422 setOperationAction(ISD::MLOAD, VT, Custom);
1423 setOperationAction(ISD::MSTORE, VT, Custom);
1427 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1428 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1429 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1430 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1431 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1432 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1433 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1434 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1436 if (ExperimentalVectorWideningLegalization) {
1437 // Need to custom widen this if we don't have AVX512BW.
1438 setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
1439 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
1440 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
1443 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1444 setOperationAction(ISD::FFLOOR, VT, Legal);
1445 setOperationAction(ISD::FCEIL, VT, Legal);
1446 setOperationAction(ISD::FTRUNC, VT, Legal);
1447 setOperationAction(ISD::FRINT, VT, Legal);
1448 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1451 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1452 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
1453 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1454 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1457 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1458 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1459 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1460 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1462 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1463 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1465 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1466 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1468 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1469 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1470 setOperationAction(ISD::SELECT, MVT::v16i32, Custom);
1471 setOperationAction(ISD::SELECT, MVT::v32i16, Custom);
1472 setOperationAction(ISD::SELECT, MVT::v64i8, Custom);
1473 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1475 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1476 setOperationAction(ISD::SMAX, VT, Legal);
1477 setOperationAction(ISD::UMAX, VT, Legal);
1478 setOperationAction(ISD::SMIN, VT, Legal);
1479 setOperationAction(ISD::UMIN, VT, Legal);
1480 setOperationAction(ISD::ABS, VT, Legal);
1481 setOperationAction(ISD::SRL, VT, Custom);
1482 setOperationAction(ISD::SHL, VT, Custom);
1483 setOperationAction(ISD::SRA, VT, Custom);
1484 setOperationAction(ISD::CTPOP, VT, Custom);
1485 setOperationAction(ISD::ROTL, VT, Custom);
1486 setOperationAction(ISD::ROTR, VT, Custom);
1487 setOperationAction(ISD::SETCC, VT, Custom);
1489 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1490 // setcc all the way to isel and prefer SETGT in some isel patterns.
1491 setCondCodeAction(ISD::SETLT, VT, Custom);
1492 setCondCodeAction(ISD::SETLE, VT, Custom);
1495 if (Subtarget.hasDQI()) {
1496 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1497 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1498 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1499 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1501 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1504 if (Subtarget.hasCDI()) {
1505 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1506 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1507 setOperationAction(ISD::CTLZ, VT, Legal);
1509 } // Subtarget.hasCDI()
1511 if (Subtarget.hasVPOPCNTDQ()) {
1512 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1513 setOperationAction(ISD::CTPOP, VT, Legal);
1516 // Extract subvector is special because the value type
1517 // (result) is 256-bit but the source is 512-bit wide.
1518 // 128-bit was made Legal under AVX1.
1519 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1520 MVT::v8f32, MVT::v4f64 })
1521 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1523 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1524 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1525 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1526 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1527 setOperationAction(ISD::VSELECT, VT, Custom);
1528 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1529 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1530 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1531 setOperationAction(ISD::MLOAD, VT, Legal);
1532 setOperationAction(ISD::MSTORE, VT, Legal);
1533 setOperationAction(ISD::MGATHER, VT, Custom);
1534 setOperationAction(ISD::MSCATTER, VT, Custom);
1536 // Need to custom split v32i16/v64i8 bitcasts.
1537 if (!Subtarget.hasBWI()) {
1538 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1539 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1542 if (Subtarget.hasVBMI2()) {
1543 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1544 setOperationAction(ISD::FSHL, VT, Custom);
1545 setOperationAction(ISD::FSHR, VT, Custom);
1550 // This block controls legalization for operations that don't have
1551 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1552 // 128/256-bit types.
1553 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1554 // These operations are handled on non-VLX by artificially widening in
1555 // isel patterns.
1556 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1558 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1559 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1560 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1561 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1562 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1564 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1565 setOperationAction(ISD::SMAX, VT, Legal);
1566 setOperationAction(ISD::UMAX, VT, Legal);
1567 setOperationAction(ISD::SMIN, VT, Legal);
1568 setOperationAction(ISD::UMIN, VT, Legal);
1569 setOperationAction(ISD::ABS, VT, Legal);
1572 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1573 setOperationAction(ISD::ROTL, VT, Custom);
1574 setOperationAction(ISD::ROTR, VT, Custom);
1577 // Custom legalize 2x32 to get a little better code.
1578 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1579 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1581 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1582 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1583 setOperationAction(ISD::MSCATTER, VT, Custom);
1585 if (Subtarget.hasDQI()) {
1586 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1587 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1588 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1589 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1590 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1592 setOperationAction(ISD::MUL, VT, Legal);
1596 if (Subtarget.hasCDI()) {
1597 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1598 setOperationAction(ISD::CTLZ, VT, Legal);
1600 } // Subtarget.hasCDI()
1602 if (Subtarget.hasVPOPCNTDQ()) {
1603 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1604 setOperationAction(ISD::CTPOP, VT, Legal);
1608 // This block controls legalization of v32i1/v64i1 which are available with
1609 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1610 // useBWIRegs.
1611 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1612 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1613 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1615 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1616 setOperationAction(ISD::ADD, VT, Custom);
1617 setOperationAction(ISD::SUB, VT, Custom);
1618 setOperationAction(ISD::MUL, VT, Custom);
1619 setOperationAction(ISD::VSELECT, VT, Expand);
1620 setOperationAction(ISD::UADDSAT, VT, Custom);
1621 setOperationAction(ISD::SADDSAT, VT, Custom);
1622 setOperationAction(ISD::USUBSAT, VT, Custom);
1623 setOperationAction(ISD::SSUBSAT, VT, Custom);
1625 setOperationAction(ISD::TRUNCATE, VT, Custom);
1626 setOperationAction(ISD::SETCC, VT, Custom);
1627 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1628 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1629 setOperationAction(ISD::SELECT, VT, Custom);
1630 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1631 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1634 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1635 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1636 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1637 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1638 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1639 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1641 // Extends from v32i1 masks to 256-bit vectors.
1642 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1643 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1644 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1647 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1648 // disabled based on prefer-vector-width and required-vector-width function
1649 // attributes.
1650 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1651 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1652 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1654 // Extends from v64i1 masks to 512-bit vectors.
1655 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1656 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1657 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1659 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1660 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1661 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1662 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1663 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1664 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1665 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1666 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1667 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1668 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1669 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1670 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1671 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1672 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1673 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1674 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1675 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1676 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1677 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1678 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1679 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1680 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1681 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1683 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1684 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1686 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1688 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1689 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1690 setOperationAction(ISD::VSELECT, VT, Custom);
1691 setOperationAction(ISD::ABS, VT, Legal);
1692 setOperationAction(ISD::SRL, VT, Custom);
1693 setOperationAction(ISD::SHL, VT, Custom);
1694 setOperationAction(ISD::SRA, VT, Custom);
1695 setOperationAction(ISD::MLOAD, VT, Legal);
1696 setOperationAction(ISD::MSTORE, VT, Legal);
1697 setOperationAction(ISD::CTPOP, VT, Custom);
1698 setOperationAction(ISD::CTLZ, VT, Custom);
1699 setOperationAction(ISD::SMAX, VT, Legal);
1700 setOperationAction(ISD::UMAX, VT, Legal);
1701 setOperationAction(ISD::SMIN, VT, Legal);
1702 setOperationAction(ISD::UMIN, VT, Legal);
1703 setOperationAction(ISD::SETCC, VT, Custom);
1704 setOperationAction(ISD::UADDSAT, VT, Legal);
1705 setOperationAction(ISD::SADDSAT, VT, Legal);
1706 setOperationAction(ISD::USUBSAT, VT, Legal);
1707 setOperationAction(ISD::SSUBSAT, VT, Legal);
1709 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1710 // setcc all the way to isel and prefer SETGT in some isel patterns.
1711 setCondCodeAction(ISD::SETLT, VT, Custom);
1712 setCondCodeAction(ISD::SETLE, VT, Custom);
1715 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1716 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1719 if (Subtarget.hasBITALG()) {
1720 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1721 setOperationAction(ISD::CTPOP, VT, Legal);
1724 if (Subtarget.hasVBMI2()) {
1725 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1726 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1730 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1731 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1732 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1733 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1736 // These operations are handled on non-VLX by artificially widening in
1737 // isel patterns.
1738 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1740 if (Subtarget.hasBITALG()) {
1741 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1742 setOperationAction(ISD::CTPOP, VT, Legal);
1746 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1747 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1748 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1749 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1750 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1751 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1753 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1754 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1755 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1756 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1757 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1759 if (Subtarget.hasDQI()) {
1760 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1761 // v2f32 UINT_TO_FP is already custom under SSE2.
1762 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1763 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1764 "Unexpected operation action!");
1765 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1766 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1767 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1770 if (Subtarget.hasBWI()) {
1771 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1772 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1775 if (Subtarget.hasVBMI2()) {
1776 // TODO: Make these legal even without VLX?
1777 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1778 MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1779 setOperationAction(ISD::FSHL, VT, Custom);
1780 setOperationAction(ISD::FSHR, VT, Custom);
1785 // We want to custom lower some of our intrinsics.
1786 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1787 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1788 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1789 if (!Subtarget.is64Bit()) {
1790 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1791 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1794 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1795 // handle type legalization for these operations here.
1797 // FIXME: We really should do custom legalization for addition and
1798 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1799 // than generic legalization for 64-bit multiplication-with-overflow, though.
1800 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1801 if (VT == MVT::i64 && !Subtarget.is64Bit())
1802 continue;
1803 // Add/Sub/Mul with overflow operations are custom lowered.
1804 setOperationAction(ISD::SADDO, VT, Custom);
1805 setOperationAction(ISD::UADDO, VT, Custom);
1806 setOperationAction(ISD::SSUBO, VT, Custom);
1807 setOperationAction(ISD::USUBO, VT, Custom);
1808 setOperationAction(ISD::SMULO, VT, Custom);
1809 setOperationAction(ISD::UMULO, VT, Custom);
1811 // Support carry in as value rather than glue.
1812 setOperationAction(ISD::ADDCARRY, VT, Custom);
1813 setOperationAction(ISD::SUBCARRY, VT, Custom);
1814 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1817 if (!Subtarget.is64Bit()) {
1818 // These libcalls are not available in 32-bit.
1819 setLibcallName(RTLIB::SHL_I128, nullptr);
1820 setLibcallName(RTLIB::SRL_I128, nullptr);
1821 setLibcallName(RTLIB::SRA_I128, nullptr);
1822 setLibcallName(RTLIB::MUL_I128, nullptr);
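// With the libcall names cleared, 128-bit shifts are expanded inline by the
// legalizer instead of being emitted as calls to __ashlti3 and friends.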
1825 // Combine sin / cos into _sincos_stret if it is available.
1826 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1827 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1828 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1829 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
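// With FSINCOS marked Custom, a sin(x)/cos(x) pair on the same argument can
// be folded into a single __sincos_stret call that returns both results.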
1832 if (Subtarget.isTargetWin64()) {
1833 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1834 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1835 setOperationAction(ISD::SREM, MVT::i128, Custom);
1836 setOperationAction(ISD::UREM, MVT::i128, Custom);
1837 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1838 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
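// Note (editorial): the Win64 ABI passes i128 indirectly, so these Custom
// lowerings are expected to end up as __divti3-style runtime calls that take
// the i128 operands by pointer rather than in register pairs.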
1841 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1842 // is. We should promote the value to 64-bits to solve this.
1843 // This is what the CRT headers do - `fmodf` is an inline header
1844 // function casting to f64 and calling `fmod`.
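// With the Promote action an f32 frem, for example, is effectively computed
// as (float)fmod((double)x, (double)y).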
1845 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1846 Subtarget.isTargetWindowsItanium()))
1847 for (ISD::NodeType Op :
1848 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1849 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1850 if (isOperationExpand(Op, MVT::f32))
1851 setOperationAction(Op, MVT::f32, Promote);
1853 // We have target-specific dag combine patterns for the following nodes:
1854 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1855 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1856 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1857 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1858 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1859 setTargetDAGCombine(ISD::BITCAST);
1860 setTargetDAGCombine(ISD::VSELECT);
1861 setTargetDAGCombine(ISD::SELECT);
1862 setTargetDAGCombine(ISD::SHL);
1863 setTargetDAGCombine(ISD::SRA);
1864 setTargetDAGCombine(ISD::SRL);
1865 setTargetDAGCombine(ISD::OR);
1866 setTargetDAGCombine(ISD::AND);
1867 setTargetDAGCombine(ISD::ADD);
1868 setTargetDAGCombine(ISD::FADD);
1869 setTargetDAGCombine(ISD::FSUB);
1870 setTargetDAGCombine(ISD::FNEG);
1871 setTargetDAGCombine(ISD::FMA);
1872 setTargetDAGCombine(ISD::FMINNUM);
1873 setTargetDAGCombine(ISD::FMAXNUM);
1874 setTargetDAGCombine(ISD::SUB);
1875 setTargetDAGCombine(ISD::LOAD);
1876 setTargetDAGCombine(ISD::MLOAD);
1877 setTargetDAGCombine(ISD::STORE);
1878 setTargetDAGCombine(ISD::MSTORE);
1879 setTargetDAGCombine(ISD::TRUNCATE);
1880 setTargetDAGCombine(ISD::ZERO_EXTEND);
1881 setTargetDAGCombine(ISD::ANY_EXTEND);
1882 setTargetDAGCombine(ISD::SIGN_EXTEND);
1883 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1884 setTargetDAGCombine(ISD::SINT_TO_FP);
1885 setTargetDAGCombine(ISD::UINT_TO_FP);
1886 setTargetDAGCombine(ISD::SETCC);
1887 setTargetDAGCombine(ISD::MUL);
1888 setTargetDAGCombine(ISD::XOR);
1889 setTargetDAGCombine(ISD::MSCATTER);
1890 setTargetDAGCombine(ISD::MGATHER);
1892 computeRegisterProperties(Subtarget.getRegisterInfo());
1894 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1895 MaxStoresPerMemsetOptSize = 8;
1896 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1897 MaxStoresPerMemcpyOptSize = 4;
1898 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1899 MaxStoresPerMemmoveOptSize = 4;
1901 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1902 // that needs to be benchmarked and balanced with the potential use of vector
1903 // load/store types (PR33329, PR33914).
1904 MaxLoadsPerMemcmp = 2;
1905 MaxLoadsPerMemcmpOptSize = 2;
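// With a limit of 2 loads, e.g. a 16-byte memcmp on x86-64 may be expanded
// into two 8-byte load-and-compare pairs instead of a library call.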
1907 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1908 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
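// e.g. the default value of 4 aligns loop headers to 2^4 = 16 bytes.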
1910 // An out-of-order CPU can speculatively execute past a predictable branch,
1911 // but a conditional move could be stalled by an expensive earlier operation.
1912 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1913 EnableExtLdPromotion = true;
1914 setPrefFunctionAlignment(4); // 2^4 bytes.
1916 verifyIntrinsicTables();
1919 // This has so far only been implemented for 64-bit MachO.
1920 bool X86TargetLowering::useLoadStackGuardNode() const {
1921 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1924 bool X86TargetLowering::useStackGuardXorFP() const {
1925 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1926 return Subtarget.getTargetTriple().isOSMSVCRT();
1929 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1930 const SDLoc &DL) const {
1931 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1932 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
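// XOR64_FP/XOR32_FP are pseudo instructions; they are expanded later to XOR
// the value with the frame register once the frame has been laid out.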
1933 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1934 return SDValue(Node, 0);
1937 TargetLoweringBase::LegalizeTypeAction
1938 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
1939 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1940 return TypeSplitVector;
1942 if (ExperimentalVectorWideningLegalization &&
1943 VT.getVectorNumElements() != 1 &&
1944 VT.getVectorElementType() != MVT::i1)
1945 return TypeWidenVector;
1947 return TargetLoweringBase::getPreferredVectorAction(VT);
1950 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1953 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1955 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1958 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1961 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1963 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1966 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1967 LLVMContext& Context,
1972 if (Subtarget.hasAVX512()) {
1973 const unsigned NumElts = VT.getVectorNumElements();
1975 // Figure out what this type will be legalized to.
1977 while (getTypeAction(Context, LegalVT) != TypeLegal)
1978 LegalVT = getTypeToTransformTo(Context, LegalVT);
1980 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1981 if (LegalVT.getSimpleVT().is512BitVector())
1982 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1984 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1985 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1986 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
1988 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1989 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1990 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1994 return VT.changeVectorElementTypeToInteger();
1997 /// Helper for getByValTypeAlignment to determine
1998 /// the desired ByVal argument alignment.
1999 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
2002 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2003 if (VTy->getBitWidth() == 128)
2004 MaxAlign = 16;
2005 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2006 unsigned EltAlign = 0;
2007 getMaxByValAlign(ATy->getElementType(), EltAlign);
2008 if (EltAlign > MaxAlign)
2009 MaxAlign = EltAlign;
2010 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2011 for (auto *EltTy : STy->elements()) {
2012 unsigned EltAlign = 0;
2013 getMaxByValAlign(EltTy, EltAlign);
2014 if (EltAlign > MaxAlign)
2015 MaxAlign = EltAlign;
2022 /// Return the desired alignment for ByVal aggregate
2023 /// function arguments in the caller parameter area. For X86, aggregates
2024 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
2025 /// are at 4-byte boundaries.
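/// For example, a struct containing a 128-bit vector member gets 16-byte
/// alignment when SSE is available, while a struct of plain i32 fields keeps
/// the default 4-byte alignment on 32-bit targets.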
2026 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2027 const DataLayout &DL) const {
2028 if (Subtarget.is64Bit()) {
2029 // Max of 8 and alignment of type.
2030 unsigned TyAlign = DL.getABITypeAlignment(Ty);
2031 if (TyAlign > 8)
2032 return TyAlign;
2033 return 8;
2034 }
2036 unsigned Align = 4;
2037 if (Subtarget.hasSSE1())
2038 getMaxByValAlign(Ty, Align);
2039 return Align;
2040 }
2042 /// Returns the target specific optimal type for load
2043 /// and store operations as a result of memset, memcpy, and memmove
2044 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
2045 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
2046 /// against the alignment requirement,
2047 /// probably because the source does not need to be loaded. If 'IsMemset' is
2048 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
2049 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
2050 /// source is constant so it does not need to be loaded.
2051 /// It returns EVT::Other if the type should be determined using generic
2052 /// target-independent logic.
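/// For example, a 32-byte (or larger) memset on an AVX target with fast
/// unaligned accesses can be emitted with 256-bit (v32i8) stores.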
2054 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
2055 unsigned DstAlign, unsigned SrcAlign,
2056 bool IsMemset, bool ZeroMemset,
2058 MachineFunction &MF) const {
2059 const Function &F = MF.getFunction();
2060 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
2062 (!Subtarget.isUnalignedMem16Slow() ||
2063 ((DstAlign == 0 || DstAlign >= 16) &&
2064 (SrcAlign == 0 || SrcAlign >= 16)))) {
2065 // FIXME: Check if unaligned 32-byte accesses are slow.
2066 if (Size >= 32 && Subtarget.hasAVX()) {
2067 // Although this isn't a well-supported type for AVX1, we'll let
2068 // legalization and shuffle lowering produce the optimal codegen. If we
2069 // choose an optimal type with a vector element larger than a byte,
2070 // getMemsetStores() may create an intermediate splat (using an integer
2071 // multiply) before we splat as a vector.
2072 return MVT::v32i8;
2073 }
2074 if (Subtarget.hasSSE2())
2075 return MVT::v16i8;
2076 // TODO: Can SSE1 handle a byte vector?
2077 // If we have SSE1 registers we should be able to use them.
2078 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
2079 return MVT::v4f32;
2080 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
2081 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2082 // Do not use f64 to lower memcpy if source is string constant. It's
2083 // better to use i32 to avoid the loads.
2084 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2085 // The gymnastics of splatting a byte value into an XMM register and then
2086 // only using 8-byte stores (because this is a CPU with slow unaligned
2087 // 16-byte accesses) makes that a loser.
2088 return MVT::f64;
2089 }
2090 }
2091 // This is a compromise. If we reach here, unaligned accesses may be slow on
2092 // this target. However, creating smaller, aligned accesses could be even
2093 // slower and would certainly be a lot more code.
2094 if (Subtarget.is64Bit() && Size >= 8)
2095 return MVT::i64;
2096 return MVT::i32;
2097 }
2099 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2100 if (VT == MVT::f32)
2101 return X86ScalarSSEf32;
2102 else if (VT == MVT::f64)
2103 return X86ScalarSSEf64;
2104 return true;
2105 }
2107 bool
2108 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
2109 unsigned,
2110 unsigned,
2111 bool *Fast) const {
2112 if (Fast) {
2113 switch (VT.getSizeInBits()) {
2114 default:
2115 // 8-byte and under are always assumed to be fast.
2116 *Fast = true;
2117 break;
2118 case 128:
2119 *Fast = !Subtarget.isUnalignedMem16Slow();
2120 break;
2121 case 256:
2122 *Fast = !Subtarget.isUnalignedMem32Slow();
2123 break;
2124 // TODO: What about AVX-512 (512-bit) accesses?
2125 }
2126 }
2127 // Misaligned accesses of any size are always allowed.
2128 return true;
2129 }
2131 /// Return the entry encoding for a jump table in the
2132 /// current function. The returned value is a member of the
2133 /// MachineJumpTableInfo::JTEntryKind enum.
2134 unsigned X86TargetLowering::getJumpTableEncoding() const {
2135 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2137 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2138 return MachineJumpTableInfo::EK_Custom32;
2140 // Otherwise, use the normal jump table encoding heuristics.
2141 return TargetLowering::getJumpTableEncoding();
2144 bool X86TargetLowering::useSoftFloat() const {
2145 return Subtarget.useSoftFloat();
2148 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2149 ArgListTy &Args) const {
2151 // Only relabel X86-32 for C / Stdcall CCs.
2152 if (Subtarget.is64Bit())
2153 return;
2154 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2155 return;
2156 unsigned ParamRegs = 0;
2157 if (auto *M = MF->getFunction().getParent())
2158 ParamRegs = M->getNumberRegisterParameters();
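// getNumberRegisterParameters() reads the "NumRegisterParameters" module
// flag, which is typically set by the frontend for -mregparm=N.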
2160 // Mark the first N int arguments as having reg
2161 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2162 Type *T = Args[Idx].Ty;
2163 if (T->isIntOrPtrTy())
2164 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2165 unsigned numRegs = 1;
2166 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2167 numRegs = 2;
2168 if (ParamRegs < numRegs)
2169 return;
2170 ParamRegs -= numRegs;
2171 Args[Idx].IsInReg = true;
2177 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2178 const MachineBasicBlock *MBB,
2179 unsigned uid,MCContext &Ctx) const{
2180 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2181 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2183 return MCSymbolRefExpr::create(MBB->getSymbol(),
2184 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2187 /// Returns relocation base for the given PIC jumptable.
2188 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2189 SelectionDAG &DAG) const {
2190 if (!Subtarget.is64Bit())
2191 // This doesn't have SDLoc associated with it, but is not really the
2192 // same as a Register.
2193 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2194 getPointerTy(DAG.getDataLayout()));
2195 return Table;
2196 }
2198 /// This returns the relocation base for the given PIC jumptable,
2199 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2200 const MCExpr *X86TargetLowering::
2201 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2202 MCContext &Ctx) const {
2203 // X86-64 uses RIP relative addressing based on the jump table label.
2204 if (Subtarget.isPICStyleRIPRel())
2205 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2207 // Otherwise, the reference is relative to the PIC base.
2208 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2211 std::pair<const TargetRegisterClass *, uint8_t>
2212 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2214 const TargetRegisterClass *RRC = nullptr;
2215 uint8_t Cost = 1;
2216 switch (VT.SimpleTy) {
2217 default:
2218 return TargetLowering::findRepresentativeClass(TRI, VT);
2219 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2220 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2221 break;
2222 case MVT::x86mmx:
2223 RRC = &X86::VR64RegClass;
2224 break;
2225 case MVT::f32: case MVT::f64:
2226 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2227 case MVT::v4f32: case MVT::v2f64:
2228 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2229 case MVT::v8f32: case MVT::v4f64:
2230 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2231 case MVT::v16f32: case MVT::v8f64:
2232 RRC = &X86::VR128XRegClass;
2235 return std::make_pair(RRC, Cost);
2238 unsigned X86TargetLowering::getAddressSpace() const {
2239 if (Subtarget.is64Bit())
2240 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2241 return 256;
2242 }
2244 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2245 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2246 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
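// SegmentOffset builds an inttoptr constant in the given address space; on
// x86, address space 256 is %gs and 257 is %fs, so e.g.
// SegmentOffset(IRB, 0x28, 257) addresses %fs:0x28.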
2249 static Constant* SegmentOffset(IRBuilder<> &IRB,
2250 unsigned Offset, unsigned AddressSpace) {
2251 return ConstantExpr::getIntToPtr(
2252 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2253 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2256 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2257 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2258 // tcbhead_t; use it instead of the usual global variable (see
2259 // sysdeps/{i386,x86_64}/nptl/tls.h)
2260 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2261 if (Subtarget.isTargetFuchsia()) {
2262 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2263 return SegmentOffset(IRB, 0x10, getAddressSpace());
2265 // %fs:0x28, unless we're using a Kernel code model, in which case
2266 // it's %gs:0x28. gs:0x14 on i386.
2267 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2268 return SegmentOffset(IRB, Offset, getAddressSpace());
2272 return TargetLowering::getIRStackGuard(IRB);
2275 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2276 // MSVC CRT provides functionalities for stack protection.
2277 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2278 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2279 // MSVC CRT has a global variable holding security cookie.
2280 M.getOrInsertGlobal("__security_cookie",
2281 Type::getInt8PtrTy(M.getContext()));
2283 // MSVC CRT has a function to validate security cookie.
2284 auto *SecurityCheckCookie = cast<Function>(
2285 M.getOrInsertFunction("__security_check_cookie",
2286 Type::getVoidTy(M.getContext()),
2287 Type::getInt8PtrTy(M.getContext())));
2288 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2289 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
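// With X86_FastCall plus the InReg attribute, the cookie is passed in ECX on
// 32-bit targets, matching the MSVC CRT's declaration of
// __security_check_cookie.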
2292 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2293 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2295 TargetLowering::insertSSPDeclarations(M);
2298 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2299 // MSVC CRT has a global variable holding security cookie.
2300 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2301 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2302 return M.getGlobalVariable("__security_cookie");
2304 return TargetLowering::getSDagStackGuard(M);
2307 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2308 // MSVC CRT has a function to validate security cookie.
2309 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2310 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2311 return M.getFunction("__security_check_cookie");
2313 return TargetLowering::getSSPStackGuardCheck(M);
2316 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2317 if (Subtarget.getTargetTriple().isOSContiki())
2318 return getDefaultSafeStackPointerLocation(IRB, false);
2320 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2321 // definition of TLS_SLOT_SAFESTACK in
2322 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2323 if (Subtarget.isTargetAndroid()) {
2324 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2326 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2327 return SegmentOffset(IRB, Offset, getAddressSpace());
2330 // Fuchsia is similar.
2331 if (Subtarget.isTargetFuchsia()) {
2332 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2333 return SegmentOffset(IRB, 0x18, getAddressSpace());
2336 return TargetLowering::getSafeStackPointerLocation(IRB);
2339 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2340 unsigned DestAS) const {
2341 assert(SrcAS != DestAS && "Expected different address spaces!");
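// Address spaces 256 and above are segment-relative on x86 (e.g. 256 = %gs,
// 257 = %fs), so only casts between ordinary (< 256) address spaces are
// no-ops.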
2343 return SrcAS < 256 && DestAS < 256;
2346 //===----------------------------------------------------------------------===//
2347 // Return Value Calling Convention Implementation
2348 //===----------------------------------------------------------------------===//
2350 #include "X86GenCallingConv.inc"
2352 bool X86TargetLowering::CanLowerReturn(
2353 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2354 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2355 SmallVector<CCValAssign, 16> RVLocs;
2356 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2357 return CCInfo.CheckReturn(Outs, RetCC_X86);
2360 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2361 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2362 return ScratchRegs;
2363 }
2365 /// Lowers mask values (v*i1) to the local register values.
2366 /// \returns DAG node after lowering to register type.
2367 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2368 const SDLoc &Dl, SelectionDAG &DAG) {
2369 EVT ValVT = ValArg.getValueType();
2371 if (ValVT == MVT::v1i1)
2372 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2373 DAG.getIntPtrConstant(0, Dl));
2375 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2376 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2377 // Two stage lowering might be required
2378 // bitcast: v8i1 -> i8 / v16i1 -> i16
2379 // anyextend: i8 -> i32 / i16 -> i32
2380 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2381 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2382 if (ValLoc == MVT::i32)
2383 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2384 return ValToCopy;
2385 }
2387 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2388 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2389 // One stage lowering is required
2390 // bitcast: v32i1 -> i32 / v64i1 -> i64
2391 return DAG.getBitcast(ValLoc, ValArg);
2394 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2397 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2398 static void Passv64i1ArgInRegs(
2399 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2400 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2401 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2402 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2403 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2404 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2405 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2406 "The value should reside in two registers");
2408 // Before splitting the value we cast it to i64
2409 Arg = DAG.getBitcast(MVT::i64, Arg);
2411 // Splitting the value into two i32 types
2412 SDValue Lo, Hi;
2413 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2414 DAG.getConstant(0, Dl, MVT::i32));
2415 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2416 DAG.getConstant(1, Dl, MVT::i32));
2418 // Attach the two i32 types into corresponding registers
2419 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2420 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2424 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2426 const SmallVectorImpl<ISD::OutputArg> &Outs,
2427 const SmallVectorImpl<SDValue> &OutVals,
2428 const SDLoc &dl, SelectionDAG &DAG) const {
2429 MachineFunction &MF = DAG.getMachineFunction();
2430 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2432 // In some cases we need to disable registers from the default CSR list.
2433 // For example, when they are used for argument passing.
2434 bool ShouldDisableCalleeSavedRegister =
2435 CallConv == CallingConv::X86_RegCall ||
2436 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2438 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2439 report_fatal_error("X86 interrupts may not return any value");
2441 SmallVector<CCValAssign, 16> RVLocs;
2442 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2443 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2446 SmallVector<SDValue, 6> RetOps;
2447 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2448 // Operand #1 = Bytes To Pop
2449 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2452 // Copy the result values into the output registers.
2453 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2455 CCValAssign &VA = RVLocs[I];
2456 assert(VA.isRegLoc() && "Can only return in registers!");
2458 // Add the register to the CalleeSaveDisableRegs list.
2459 if (ShouldDisableCalleeSavedRegister)
2460 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2462 SDValue ValToCopy = OutVals[OutsIndex];
2463 EVT ValVT = ValToCopy.getValueType();
2465 // Promote values to the appropriate types.
2466 if (VA.getLocInfo() == CCValAssign::SExt)
2467 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2468 else if (VA.getLocInfo() == CCValAssign::ZExt)
2469 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2470 else if (VA.getLocInfo() == CCValAssign::AExt) {
2471 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2472 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2474 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2476 else if (VA.getLocInfo() == CCValAssign::BCvt)
2477 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2479 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2480 "Unexpected FP-extend for return value.");
2482 // If this is x86-64, and we disabled SSE, we can't return FP values,
2483 // or SSE or MMX vectors.
2484 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2485 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2486 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2487 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2488 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2489 } else if (ValVT == MVT::f64 &&
2490 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2491 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2492 // llvm-gcc has never done it right and no one has noticed, so this
2493 // should be OK for now.
2494 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2495 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2498 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2499 // the RET instruction and handled by the FP Stackifier.
2500 if (VA.getLocReg() == X86::FP0 ||
2501 VA.getLocReg() == X86::FP1) {
2502 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2503 // change the value to the FP stack register class.
2504 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2505 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2506 RetOps.push_back(ValToCopy);
2507 // Don't emit a copytoreg.
2511 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2512 // which is returned in RAX / RDX.
2513 if (Subtarget.is64Bit()) {
2514 if (ValVT == MVT::x86mmx) {
2515 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2516 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2517 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2519 // If we don't have SSE2 available, convert to v4f32 so the generated
2520 // register is legal.
2521 if (!Subtarget.hasSSE2())
2522 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2527 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2529 if (VA.needsCustom()) {
2530 assert(VA.getValVT() == MVT::v64i1 &&
2531 "Currently the only custom case is when we split v64i1 to 2 regs");
2533 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2536 assert(2 == RegsToPass.size() &&
2537 "Expecting two registers after Pass64BitArgInRegs");
2539 // Add the second register to the CalleeSaveDisableRegs list.
2540 if (ShouldDisableCalleeSavedRegister)
2541 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2543 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2546 // Add nodes to the DAG and add the values into the RetOps list
2547 for (auto &Reg : RegsToPass) {
2548 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2549 Flag = Chain.getValue(1);
2550 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2554 // Swift calling convention does not require we copy the sret argument
2555 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2557 // All x86 ABIs require that for returning structs by value we copy
2558 // the sret argument into %rax/%eax (depending on ABI) for the return.
2559 // We saved the argument into a virtual register in the entry block,
2560 // so now we copy the value out and into %rax/%eax.
2562 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2563 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2564 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2565 // either case FuncInfo->setSRetReturnReg() will have been called.
2566 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2567 // When we have both sret and another return value, we should use the
2568 // original Chain stored in RetOps[0], instead of the current Chain updated
2569 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2571 // For the case of sret and another return value, we have
2572 // Chain_0 at the function entry
2573 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2574 // If we use Chain_1 in getCopyFromReg, we will have
2575 // Val = getCopyFromReg(Chain_1)
2576 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2578 // getCopyToReg(Chain_0) will be glued together with
2579 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2580 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2581 // Data dependency from Unit B to Unit A due to usage of Val in
2582 // getCopyToReg(Chain_1, Val)
2583 // Chain dependency from Unit A to Unit B
2585 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2586 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2587 getPointerTy(MF.getDataLayout()));
2589 unsigned RetValReg
2590 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2591 X86::RAX : X86::EAX;
2592 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2593 Flag = Chain.getValue(1);
2595 // RAX/EAX now acts like a return value.
2596 RetOps.push_back(
2597 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2599 // Add the returned register to the CalleeSaveDisableRegs list.
2600 if (ShouldDisableCalleeSavedRegister)
2601 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2604 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2605 const MCPhysReg *I =
2606 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2607 if (I) {
2608 for (; *I; ++I) {
2609 if (X86::GR64RegClass.contains(*I))
2610 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2611 else
2612 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2613 }
2614 }
2616 RetOps[0] = Chain; // Update chain.
2618 // Add the flag if we have it.
2619 if (Flag.getNode())
2620 RetOps.push_back(Flag);
2622 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2623 if (CallConv == CallingConv::X86_INTR)
2624 opcode = X86ISD::IRET;
2625 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2628 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2629 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2630 return false;
2632 SDValue TCChain = Chain;
2633 SDNode *Copy = *N->use_begin();
2634 if (Copy->getOpcode() == ISD::CopyToReg) {
2635 // If the copy has a glue operand, we conservatively assume it isn't safe to
2636 // perform a tail call.
2637 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2638 return false;
2639 TCChain = Copy->getOperand(0);
2640 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2641 return false;
2643 bool HasRet = false;
2644 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2646 if (UI->getOpcode() != X86ISD::RET_FLAG)
2648 // If we are returning more than one value, we can definitely
2649 // not make a tail call; see PR19530.
2650 if (UI->getNumOperands() > 4)
2651 return false;
2652 if (UI->getNumOperands() == 4 &&
2653 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2654 return false;
2655 HasRet = true;
2656 }
2658 if (!HasRet)
2659 return false;
2661 Chain = TCChain;
2662 return true;
2663 }
2665 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2666 ISD::NodeType ExtendKind) const {
2667 MVT ReturnMVT = MVT::i32;
2669 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2670 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2671 // The ABI does not require i1, i8 or i16 to be extended.
2673 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2674 // always extending i8/i16 return values, so keep doing that for now.
2676 ReturnMVT = MVT::i8;
2679 EVT MinVT = getRegisterType(Context, ReturnMVT);
2680 return VT.bitsLT(MinVT) ? MinVT : VT;
2683 /// Reads two 32 bit registers and creates a 64 bit mask value.
2684 /// \param VA The current 32 bit value that need to be assigned.
2685 /// \param NextVA The next 32 bit value that need to be assigned.
2686 /// \param Root The parent DAG node.
2687 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2688 /// glue purposes. In case the DAG is already using a physical register
2689 /// instead of a virtual one, we should glue our new SDValue to the InFlag
2690 /// SDValue.
2691 /// \return a new SDValue of size 64 bits.
2692 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2693 SDValue &Root, SelectionDAG &DAG,
2694 const SDLoc &Dl, const X86Subtarget &Subtarget,
2695 SDValue *InFlag = nullptr) {
2696 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2697 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2698 assert(VA.getValVT() == MVT::v64i1 &&
2699 "Expecting first location of 64 bit width type");
2700 assert(NextVA.getValVT() == VA.getValVT() &&
2701 "The locations should have the same type");
2702 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2703 "The values should reside in two registers");
2705 SDValue Lo, Hi;
2706 unsigned Reg;
2707 SDValue ArgValueLo, ArgValueHi;
2709 MachineFunction &MF = DAG.getMachineFunction();
2710 const TargetRegisterClass *RC = &X86::GR32RegClass;
2712 // Read a 32 bit value from the registers.
2713 if (nullptr == InFlag) {
2714 // When no physical register is present,
2715 // create an intermediate virtual register.
2716 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2717 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2718 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2719 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2721 // When a physical register is available read the value from it and glue
2722 // the reads together.
2724 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2725 *InFlag = ArgValueLo.getValue(2);
2727 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2728 *InFlag = ArgValueHi.getValue(2);
2731 // Convert the i32 type into v32i1 type.
2732 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2734 // Convert the i32 type into v32i1 type.
2735 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2737 // Concatenate the two values together.
2738 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2741 /// The function will lower a register of various sizes (8/16/32/64)
2742 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2743 /// \returns a DAG node containing the operand after lowering to mask type.
2744 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2745 const EVT &ValLoc, const SDLoc &Dl,
2746 SelectionDAG &DAG) {
2747 SDValue ValReturned = ValArg;
2749 if (ValVT == MVT::v1i1)
2750 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2752 if (ValVT == MVT::v64i1) {
2753 // On 32 bit targets this case is handled by getv64i1Argument.
2754 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2755 // On 64 bit targets there is no need to truncate the value, only bitcast.
2756 } else {
2757 MVT maskLen;
2758 switch (ValVT.getSimpleVT().SimpleTy) {
2759 case MVT::v8i1:
2760 maskLen = MVT::i8;
2761 break;
2762 case MVT::v16i1:
2763 maskLen = MVT::i16;
2764 break;
2765 case MVT::v32i1:
2766 maskLen = MVT::i32;
2767 break;
2768 default:
2769 llvm_unreachable("Expecting a vector of i1 types");
2770 }
2772 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2774 return DAG.getBitcast(ValVT, ValReturned);
2777 /// Lower the result values of a call into the
2778 /// appropriate copies out of appropriate physical registers.
2780 SDValue X86TargetLowering::LowerCallResult(
2781 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2782 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2783 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2784 uint32_t *RegMask) const {
2786 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2787 // Assign locations to each value returned by this call.
2788 SmallVector<CCValAssign, 16> RVLocs;
2789 bool Is64Bit = Subtarget.is64Bit();
2790 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2792 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2794 // Copy all of the result registers out of their specified physreg.
2795 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2797 CCValAssign &VA = RVLocs[I];
2798 EVT CopyVT = VA.getLocVT();
2800 // In some calling conventions we need to remove the used registers
2801 // from the register mask.
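// The mask is a packed bit vector: register N corresponds to bit (N % 32) of
// word (N / 32), and a set bit means the register is preserved across the
// call. Clearing the bits for the returned register and all of its
// sub-registers marks them as clobbered, e.g. a value returned in EAX also
// clears AX, AH and AL (illustrative).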
2803 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2804 SubRegs.isValid(); ++SubRegs)
2805 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2808 // If this is x86-64, and we disabled SSE, we can't return FP values
2809 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2810 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2811 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2812 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2815 // If we prefer to use the value in xmm registers, copy it out as f80 and
2816 // use a truncate to move it from fp stack reg to xmm reg.
2817 bool RoundAfterCopy = false;
2818 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2819 isScalarFPTypeInSSEReg(VA.getValVT())) {
2820 if (!Subtarget.hasX87())
2821 report_fatal_error("X87 register return with X87 disabled");
2823 RoundAfterCopy = (CopyVT != VA.getLocVT());
2827 if (VA.needsCustom()) {
2828 assert(VA.getValVT() == MVT::v64i1 &&
2829 "Currently the only custom case is when we split v64i1 to 2 regs");
2831 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2833 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2835 Val = Chain.getValue(0);
2836 InFlag = Chain.getValue(2);
2840 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2841 // This truncation won't change the value.
2842 DAG.getIntPtrConstant(1, dl));
2844 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2845 if (VA.getValVT().isVector() &&
2846 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2847 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2848 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2849 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2851 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2854 InVals.push_back(Val);
2860 //===----------------------------------------------------------------------===//
2861 // C & StdCall & Fast Calling Convention implementation
2862 //===----------------------------------------------------------------------===//
2863 // The StdCall calling convention is the standard for many Windows API
2864 // routines and the like. It differs from the C calling convention only in
2865 // that the callee cleans up the stack instead of the caller, and symbols are
2866 // decorated in a particular way. It doesn't support any vector arguments.
2867 // For info on the fast calling convention see the Fast Calling Convention
2868 // (tail call) implementation in LowerX86_32FastCCCallTo.
2870 /// Determines whether a call uses struct return semantics.
2872 enum StructReturnType {
2877 static StructReturnType
2878 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
2880 return NotStructReturn;
2882 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2883 if (!Flags.isSRet())
2884 return NotStructReturn;
2885 if (Flags.isInReg() || IsMCU)
2886 return RegStructReturn;
2887 return StackStructReturn;
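// For example (illustrative IR, not from a test): a call such as
//   call void @make(%struct.S* sret %tmp)
// is classified as StackStructReturn on typical 32-bit targets, and as
// RegStructReturn when the sret pointer is marked inreg or when targeting MCU.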
2890 /// Determines whether a function uses struct return semantics.
2891 static StructReturnType
2892 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
2894 return NotStructReturn;
2896 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2897 if (!Flags.isSRet())
2898 return NotStructReturn;
2899 if (Flags.isInReg() || IsMCU)
2900 return RegStructReturn;
2901 return StackStructReturn;
2904 /// Make a copy of an aggregate at the address specified by "Src" to the
2905 /// address "Dst", with size and alignment information taken from the byval
2906 /// parameter attribute. The copy will be passed as a byval function parameter.
2907 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2908 SDValue Chain, ISD::ArgFlagsTy Flags,
2909 SelectionDAG &DAG, const SDLoc &dl) {
2910 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2912 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2913 /*isVolatile*/false, /*AlwaysInline=*/true,
2914 /*isTailCall*/false,
2915 MachinePointerInfo(), MachinePointerInfo());
2918 /// Return true if the calling convention is one that we can guarantee TCO for.
2919 static bool canGuaranteeTCO(CallingConv::ID CC) {
2920 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2921 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2922 CC == CallingConv::HHVM);
2925 /// Return true if we might ever do TCO for calls with this calling convention.
2926 static bool mayTailCallThisCC(CallingConv::ID CC) {
2928 // C calling conventions:
2929 case CallingConv::C:
2930 case CallingConv::Win64:
2931 case CallingConv::X86_64_SysV:
2932 // Callee pop conventions:
2933 case CallingConv::X86_ThisCall:
2934 case CallingConv::X86_StdCall:
2935 case CallingConv::X86_VectorCall:
2936 case CallingConv::X86_FastCall:
2939 return canGuaranteeTCO(CC);
2943 /// Return true if the function is being made into a tailcall target by
2944 /// changing its ABI.
2945 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2946 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2949 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2951 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2952 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2955 ImmutableCallSite CS(CI);
2956 CallingConv::ID CalleeCC = CS.getCallingConv();
2957 if (!mayTailCallThisCC(CalleeCC))
2964 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2965 const SmallVectorImpl<ISD::InputArg> &Ins,
2966 const SDLoc &dl, SelectionDAG &DAG,
2967 const CCValAssign &VA,
2968 MachineFrameInfo &MFI, unsigned i) const {
2969 // Create the nodes corresponding to a load from this parameter slot.
2970 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2971 bool AlwaysUseMutable = shouldGuaranteeTCO(
2972 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2973 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2975 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2977 // If the value is passed by pointer, we receive the address instead of the
2978 // value itself. No need to extend if the mask value and location share the same size.
2980 bool ExtendedInMem =
2981 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2982 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2984 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2985 ValVT = VA.getLocVT();
2987 ValVT = VA.getValVT();
2989 // Calculate the SP offset of an interrupt parameter, reusing the slot that is
2990 // normally taken by the return address.
2992 if (CallConv == CallingConv::X86_INTR) {
2993 // X86 interrupts may take one or two arguments.
2994 // Unlike a regular call, there is no return address on the stack.
2995 // The offset of the last argument needs to be set to -4/-8 bytes,
2996 // while the offset of the first argument (when there are two) is 0 bytes.
2997 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
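// For example (illustrative): with two incoming arguments on a 32-bit target,
// i == 0 yields Offset == 0 and i == 1 (the error code) yields Offset == -4;
// a single-argument handler always gets -4 (or -8 in 64-bit mode).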
2998 if (Subtarget.is64Bit() && Ins.size() == 2) {
2999 // The stack pointer needs to be realigned for 64 bit handlers with error
3000 // code, so the argument offset changes by 8 bytes.
3005 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3006 // changed with more analysis.
3007 // In the case of tail call optimization, mark all arguments mutable, since
3008 // they could be overwritten when arguments are lowered for a tail call.
3009 if (Flags.isByVal()) {
3010 unsigned Bytes = Flags.getByValSize();
3011 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3013 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3014 // can be improved with deeper analysis.
3015 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3016 /*isAliased=*/true);
3017 // Adjust SP offset of interrupt parameter.
3018 if (CallConv == CallingConv::X86_INTR) {
3019 MFI.setObjectOffset(FI, Offset);
3021 return DAG.getFrameIndex(FI, PtrVT);
3024 // This is an argument in memory. We might be able to perform copy elision.
3025 if (Flags.isCopyElisionCandidate()) {
3026 EVT ArgVT = Ins[i].ArgVT;
3028 if (Ins[i].PartOffset == 0) {
3029 // If this is a one-part value or the first part of a multi-part value,
3030 // create a stack object for the entire argument value type and return a
3031 // load from our portion of it. This assumes that if the first part of an
3032 // argument is in memory, the rest will also be in memory.
3033 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3034 /*Immutable=*/false);
3035 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3037 ValVT, dl, Chain, PartAddr,
3038 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3040 // This is not the first piece of an argument in memory. See if there is
3041 // already a fixed stack object including this offset. If so, assume it
3042 // was created by the PartOffset == 0 branch above and create a load from
3043 // the appropriate offset into it.
3044 int64_t PartBegin = VA.getLocMemOffset();
3045 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3046 int FI = MFI.getObjectIndexBegin();
3047 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3048 int64_t ObjBegin = MFI.getObjectOffset(FI);
3049 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3050 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3053 if (MFI.isFixedObjectIndex(FI)) {
3055 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3056 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3058 ValVT, dl, Chain, Addr,
3059 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3060 Ins[i].PartOffset));
3065 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3066 VA.getLocMemOffset(), isImmutable);
3068 // Set SExt or ZExt flag.
3069 if (VA.getLocInfo() == CCValAssign::ZExt) {
3070 MFI.setObjectZExt(FI, true);
3071 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3072 MFI.setObjectSExt(FI, true);
3075 // Adjust SP offset of interrupt parameter.
3076 if (CallConv == CallingConv::X86_INTR) {
3077 MFI.setObjectOffset(FI, Offset);
3080 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3081 SDValue Val = DAG.getLoad(
3082 ValVT, dl, Chain, FIN,
3083 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3084 return ExtendedInMem
3085 ? (VA.getValVT().isVector()
3086 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3087 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3091 // FIXME: Get this from tablegen.
3092 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3093 const X86Subtarget &Subtarget) {
3094 assert(Subtarget.is64Bit());
3096 if (Subtarget.isCallingConvWin64(CallConv)) {
3097 static const MCPhysReg GPR64ArgRegsWin64[] = {
3098 X86::RCX, X86::RDX, X86::R8, X86::R9
3100 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3103 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3104 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3106 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3109 // FIXME: Get this from tablegen.
3110 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3111 CallingConv::ID CallConv,
3112 const X86Subtarget &Subtarget) {
3113 assert(Subtarget.is64Bit());
3114 if (Subtarget.isCallingConvWin64(CallConv)) {
3115 // The XMM registers which might contain var arg parameters are shadowed
3116 // in their paired GPRs, so we only need to save the GPRs to their home slots.
3118 // TODO: __vectorcall will change this.
3122 const Function &F = MF.getFunction();
3123 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
3124 bool isSoftFloat = Subtarget.useSoftFloat();
3125 assert(!(isSoftFloat && NoImplicitFloatOps) &&
3126 "SSE register cannot be used when SSE is disabled!");
3127 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
3128 // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
3132 static const MCPhysReg XMMArgRegs64Bit[] = {
3133 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3134 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3136 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3140 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3141 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
3142 [](const CCValAssign &A, const CCValAssign &B) -> bool {
3143 return A.getValNo() < B.getValNo();
3148 SDValue X86TargetLowering::LowerFormalArguments(
3149 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3150 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3151 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3152 MachineFunction &MF = DAG.getMachineFunction();
3153 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3154 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3156 const Function &F = MF.getFunction();
3157 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3158 F.getName() == "main")
3159 FuncInfo->setForceFramePointer(true);
3161 MachineFrameInfo &MFI = MF.getFrameInfo();
3162 bool Is64Bit = Subtarget.is64Bit();
3163 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3166 !(isVarArg && canGuaranteeTCO(CallConv)) &&
3167 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3169 if (CallConv == CallingConv::X86_INTR) {
3170 bool isLegal = Ins.size() == 1 ||
3171 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
3172 (!Is64Bit && Ins[1].VT == MVT::i32)));
3174 report_fatal_error("X86 interrupts may take one or two arguments");
3177 // Assign locations to all of the incoming arguments.
3178 SmallVector<CCValAssign, 16> ArgLocs;
3179 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3181 // Allocate shadow area for Win64.
3183 CCInfo.AllocateStack(32, 8);
3185 CCInfo.AnalyzeArguments(Ins, CC_X86);
3187 // In the vectorcall calling convention a second pass is required for the HVA registers.
3189 if (CallingConv::X86_VectorCall == CallConv) {
3190 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3193 // The next loop assumes that the locations are in the same order as the incoming arguments.
3195 assert(isSortedByValueNo(ArgLocs) &&
3196 "Argument Location list must be sorted before lowering");
3199 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3201 assert(InsIndex < Ins.size() && "Invalid Ins index");
3202 CCValAssign &VA = ArgLocs[I];
3204 if (VA.isRegLoc()) {
3205 EVT RegVT = VA.getLocVT();
3206 if (VA.needsCustom()) {
3208 VA.getValVT() == MVT::v64i1 &&
3209 "Currently the only custom case is when we split v64i1 to 2 regs");
3211 // In the regcall calling convention on a 32-bit target, v64i1 values are
3212 // split up into two registers.
3214 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3216 const TargetRegisterClass *RC;
3217 if (RegVT == MVT::i8)
3218 RC = &X86::GR8RegClass;
3219 else if (RegVT == MVT::i16)
3220 RC = &X86::GR16RegClass;
3221 else if (RegVT == MVT::i32)
3222 RC = &X86::GR32RegClass;
3223 else if (Is64Bit && RegVT == MVT::i64)
3224 RC = &X86::GR64RegClass;
3225 else if (RegVT == MVT::f32)
3226 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3227 else if (RegVT == MVT::f64)
3228 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3229 else if (RegVT == MVT::f80)
3230 RC = &X86::RFP80RegClass;
3231 else if (RegVT == MVT::f128)
3232 RC = &X86::VR128RegClass;
3233 else if (RegVT.is512BitVector())
3234 RC = &X86::VR512RegClass;
3235 else if (RegVT.is256BitVector())
3236 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3237 else if (RegVT.is128BitVector())
3238 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3239 else if (RegVT == MVT::x86mmx)
3240 RC = &X86::VR64RegClass;
3241 else if (RegVT == MVT::v1i1)
3242 RC = &X86::VK1RegClass;
3243 else if (RegVT == MVT::v8i1)
3244 RC = &X86::VK8RegClass;
3245 else if (RegVT == MVT::v16i1)
3246 RC = &X86::VK16RegClass;
3247 else if (RegVT == MVT::v32i1)
3248 RC = &X86::VK32RegClass;
3249 else if (RegVT == MVT::v64i1)
3250 RC = &X86::VK64RegClass;
3252 llvm_unreachable("Unknown argument type!");
3254 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3255 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3258 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3259 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
3261 if (VA.getLocInfo() == CCValAssign::SExt)
3262 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3263 DAG.getValueType(VA.getValVT()));
3264 else if (VA.getLocInfo() == CCValAssign::ZExt)
3265 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3266 DAG.getValueType(VA.getValVT()));
3267 else if (VA.getLocInfo() == CCValAssign::BCvt)
3268 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3270 if (VA.isExtInLoc()) {
3271 // Handle MMX values passed in XMM regs.
3272 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3273 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3274 else if (VA.getValVT().isVector() &&
3275 VA.getValVT().getScalarType() == MVT::i1 &&
3276 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3277 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3278 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3279 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3281 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3284 assert(VA.isMemLoc());
3286 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3289 // If the value is passed via a pointer, do a load.
3290 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3292 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3294 InVals.push_back(ArgValue);
3297 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3298 // The Swift calling convention does not require us to copy the sret argument
3299 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3300 if (CallConv == CallingConv::Swift)
3303 // All x86 ABIs require that for returning structs by value we copy the
3304 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3305 // the argument into a virtual register so that we can access it from the return points.
3307 if (Ins[I].Flags.isSRet()) {
3308 unsigned Reg = FuncInfo->getSRetReturnReg();
3310 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3311 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3312 FuncInfo->setSRetReturnReg(Reg);
3314 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3315 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3320 unsigned StackSize = CCInfo.getNextStackOffset();
3321 // Align stack specially for tail calls.
3322 if (shouldGuaranteeTCO(CallConv,
3323 MF.getTarget().Options.GuaranteedTailCallOpt))
3324 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3326 // If the function takes variable number of arguments, make a frame index for
3327 // the start of the first vararg value... for expansion of llvm.va_start. We
3328 // can skip this if there are no va_start calls.
3329 if (MFI.hasVAStart() &&
3330 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3331 CallConv != CallingConv::X86_ThisCall))) {
3332 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3335 // Figure out if XMM registers are in use.
3336 assert(!(Subtarget.useSoftFloat() &&
3337 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3338 "SSE register cannot be used when SSE is disabled!");
3340 // 64-bit calling conventions support varargs and register parameters, so we
3341 // have to do extra work to spill them in the prologue.
3342 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3343 // Find the first unallocated argument registers.
3344 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3345 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3346 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3347 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3348 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3349 "SSE register cannot be used when SSE is disabled!");
3351 // Gather all the live in physical registers.
3352 SmallVector<SDValue, 6> LiveGPRs;
3353 SmallVector<SDValue, 8> LiveXMMRegs;
3355 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3356 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3358 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3360 if (!ArgXMMs.empty()) {
3361 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3362 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3363 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3364 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3365 LiveXMMRegs.push_back(
3366 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3371 // Get to the caller-allocated home save location. Add 8 to account
3372 // for the return address.
3373 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3374 FuncInfo->setRegSaveFrameIndex(
3375 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3376 // Fixup to set vararg frame on shadow area (4 x i64).
3378 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3380 // For X86-64, if there are vararg parameters that are passed via
3381 // registers, then we must store them to their spots on the stack so
3382 // they may be loaded by dereferencing the result of va_next.
3383 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3384 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3385 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3386 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
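// Illustrative SysV numbers (an assumption for the example, not a
// requirement): with 6 GPRs and 8 XMM registers the save area is
// 6 * 8 + 8 * 16 = 176 bytes; a function that already consumed 2 GPRs and
// 1 XMM register records gp_offset = 16 and fp_offset = 48 + 16 = 64,
// matching what va_arg expects to find in the va_list.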
3389 // Store the integer parameter registers.
3390 SmallVector<SDValue, 8> MemOps;
3391 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3392 getPointerTy(DAG.getDataLayout()));
3393 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3394 for (SDValue Val : LiveGPRs) {
3395 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3396 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3398 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3399 MachinePointerInfo::getFixedStack(
3400 DAG.getMachineFunction(),
3401 FuncInfo->getRegSaveFrameIndex(), Offset));
3402 MemOps.push_back(Store);
3406 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3407 // Now store the XMM (fp + vector) parameter registers.
3408 SmallVector<SDValue, 12> SaveXMMOps;
3409 SaveXMMOps.push_back(Chain);
3410 SaveXMMOps.push_back(ALVal);
3411 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3412 FuncInfo->getRegSaveFrameIndex(), dl));
3413 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3414 FuncInfo->getVarArgsFPOffset(), dl));
3415 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3417 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3418 MVT::Other, SaveXMMOps));
3421 if (!MemOps.empty())
3422 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3425 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3426 // Find the largest legal vector type.
3427 MVT VecVT = MVT::Other;
3428 // FIXME: Only some x86_32 calling conventions support AVX512.
3429 if (Subtarget.hasAVX512() &&
3430 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3431 CallConv == CallingConv::Intel_OCL_BI)))
3432 VecVT = MVT::v16f32;
3433 else if (Subtarget.hasAVX())
3435 else if (Subtarget.hasSSE2())
3438 // We forward some GPRs and some vector types.
3439 SmallVector<MVT, 2> RegParmTypes;
3440 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3441 RegParmTypes.push_back(IntVT);
3442 if (VecVT != MVT::Other)
3443 RegParmTypes.push_back(VecVT);
3445 // Compute the set of forwarded registers. The rest are scratch.
3446 SmallVectorImpl<ForwardedRegister> &Forwards =
3447 FuncInfo->getForwardedMustTailRegParms();
3448 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3450 // Conservatively forward AL on x86_64, since it might be used for varargs.
3451 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3452 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3453 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3456 // Copy all forwards from physical to virtual registers.
3457 for (ForwardedRegister &F : Forwards) {
3458 // FIXME: Can we use a less constrained schedule?
3459 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3460 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3461 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3465 // Some CCs need callee pop.
3466 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3467 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3468 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3469 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3470 // X86 interrupts must pop the error code (and the alignment padding) if present.
3472 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3474 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3475 // If this is an sret function, the return should pop the hidden pointer.
3476 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3477 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3478 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3479 FuncInfo->setBytesToPopOnReturn(4);
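// For example, on 32-bit Linux a callee with an sret argument typically
// returns with "ret $4" so that the hidden pointer pushed by the caller is
// popped as well (illustrative; MSVC targets leave that to the caller).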
3483 // RegSaveFrameIndex is X86-64 only.
3484 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3485 if (CallConv == CallingConv::X86_FastCall ||
3486 CallConv == CallingConv::X86_ThisCall)
3487 // fastcc functions can't have varargs.
3488 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3491 FuncInfo->setArgumentStackSize(StackSize);
3493 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3494 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3495 if (Personality == EHPersonality::CoreCLR) {
3497 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3498 // that we'd prefer this slot be allocated towards the bottom of the frame
3499 // (i.e. near the stack pointer after allocating the frame). Every
3500 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3501 // offset from the bottom of this and each funclet's frame must be the
3502 // same, so the size of funclets' (mostly empty) frames is dictated by
3503 // how far this slot is from the bottom (since they allocate just enough
3504 // space to accommodate holding this slot at the correct offset).
3505 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3506 EHInfo->PSPSymFrameIdx = PSPSymFI;
3510 if (CallConv == CallingConv::X86_RegCall ||
3511 F.hasFnAttribute("no_caller_saved_registers")) {
3512 MachineRegisterInfo &MRI = MF.getRegInfo();
3513 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3514 MRI.disableCalleeSavedRegister(Pair.first);
3520 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3521 SDValue Arg, const SDLoc &dl,
3523 const CCValAssign &VA,
3524 ISD::ArgFlagsTy Flags) const {
3525 unsigned LocMemOffset = VA.getLocMemOffset();
3526 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3527 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3529 if (Flags.isByVal())
3530 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3532 return DAG.getStore(
3533 Chain, dl, Arg, PtrOff,
3534 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3537 /// Emit a load of return address if tail call
3538 /// optimization is performed and it is required.
3539 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3540 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3541 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3542 // Adjust the Return address stack slot.
3543 EVT VT = getPointerTy(DAG.getDataLayout());
3544 OutRetAddr = getReturnAddressFrameIndex(DAG);
3546 // Load the "old" Return address.
3547 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3548 return SDValue(OutRetAddr.getNode(), 1);
3551 /// Emit a store of the return address if tail call
3552 /// optimization is performed and it is required (FPDiff!=0).
3553 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3554 SDValue Chain, SDValue RetAddrFrIdx,
3555 EVT PtrVT, unsigned SlotSize,
3556 int FPDiff, const SDLoc &dl) {
3557 // Store the return address to the appropriate stack slot.
3558 if (!FPDiff) return Chain;
3559 // Calculate the new stack slot for the return address.
3560 int NewReturnAddrFI =
3561 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3563 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3564 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3565 MachinePointerInfo::getFixedStack(
3566 DAG.getMachineFunction(), NewReturnAddrFI));
3570 /// Returns a vector_shuffle mask for a movs{s|d}, movd
3571 /// operation of specified width.
3572 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3574 unsigned NumElems = VT.getVectorNumElements();
3575 SmallVector<int, 8> Mask;
3576 Mask.push_back(NumElems);
3577 for (unsigned i = 1; i != NumElems; ++i)
3579 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
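// For example (v4i32, purely illustrative): the mask built above is
// <4, 1, 2, 3>, i.e. element 0 is taken from V2 and the remaining elements
// from V1, which is exactly the movss/movsd-style merge.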
3583 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3584 SmallVectorImpl<SDValue> &InVals) const {
3585 SelectionDAG &DAG = CLI.DAG;
3587 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3588 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3589 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3590 SDValue Chain = CLI.Chain;
3591 SDValue Callee = CLI.Callee;
3592 CallingConv::ID CallConv = CLI.CallConv;
3593 bool &isTailCall = CLI.IsTailCall;
3594 bool isVarArg = CLI.IsVarArg;
3596 MachineFunction &MF = DAG.getMachineFunction();
3597 bool Is64Bit = Subtarget.is64Bit();
3598 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3599 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3600 bool IsSibcall = false;
3601 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3602 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3603 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3604 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3605 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3606 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3607 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
3609 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3610 const Module *M = MF.getMMI().getModule();
3611 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3613 if (CallConv == CallingConv::X86_INTR)
3614 report_fatal_error("X86 interrupts may not be called directly");
3616 if (Attr.getValueAsString() == "true")
3619 if (Subtarget.isPICStyleGOT() &&
3620 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3621 // If we are using a GOT, disable tail calls to external symbols with
3622 // default visibility. Tail calling such a symbol requires using a GOT
3623 // relocation, which forces early binding of the symbol. This breaks code
3624 // that requires lazy function symbol resolution. Using musttail or
3625 // GuaranteedTailCallOpt will override this.
3626 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3627 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3628 G->getGlobal()->hasDefaultVisibility()))
3632 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3634 // Force this to be a tail call. The verifier rules are enough to ensure
3635 // that we can lower this successfully without moving the return address
3638 } else if (isTailCall) {
3639 // Check if it's really possible to do a tail call.
3640 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3641 isVarArg, SR != NotStructReturn,
3642 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3643 Outs, OutVals, Ins, DAG);
3645 // Sibcalls are automatically detected tailcalls which do not require ABI changes.
3647 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3654 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3655 "Var args not supported with calling convention fastcc, ghc or hipe");
3657 // Analyze operands of the call, assigning locations to each operand.
3658 SmallVector<CCValAssign, 16> ArgLocs;
3659 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3661 // Allocate shadow area for Win64.
3663 CCInfo.AllocateStack(32, 8);
3665 CCInfo.AnalyzeArguments(Outs, CC_X86);
3667 // In the vectorcall calling convention a second pass is required for the HVA registers.
3669 if (CallingConv::X86_VectorCall == CallConv) {
3670 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3673 // Get a count of how many bytes are to be pushed on the stack.
3674 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3676 // This is a sibcall. The memory operands are already available in the
3677 // caller's own incoming argument stack area.
3679 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3680 canGuaranteeTCO(CallConv))
3681 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3684 if (isTailCall && !IsSibcall && !IsMustTail) {
3685 // Lower arguments at fp - stackoffset + fpdiff.
3686 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3688 FPDiff = NumBytesCallerPushed - NumBytes;
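// Illustrative: if the caller pushed 16 bytes of arguments but the callee
// needs 32, FPDiff is -16, i.e. the callee's argument area extends 16 bytes
// beyond the caller's and the return address slot has to move accordingly
// (see EmitTailCallStoreRetAddr).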
3690 // Set the delta of movement of the returnaddr stackslot.
3691 // But only set if delta is greater than previous delta.
3692 if (FPDiff < X86Info->getTCReturnAddrDelta())
3693 X86Info->setTCReturnAddrDelta(FPDiff);
3696 unsigned NumBytesToPush = NumBytes;
3697 unsigned NumBytesToPop = NumBytes;
3699 // If we have an inalloca argument, all stack space has already been allocated
3700 // for us and is right at the top of the stack. We don't support multiple
3701 // arguments passed in memory when using inalloca.
3702 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3704 if (!ArgLocs.back().isMemLoc())
3705 report_fatal_error("cannot use inalloca attribute on a register "
3707 if (ArgLocs.back().getLocMemOffset() != 0)
3708 report_fatal_error("any parameter with the inalloca attribute must be "
3709 "the only memory argument");
3713 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3714 NumBytes - NumBytesToPush, dl);
3716 SDValue RetAddrFrIdx;
3717 // Load return address for tail calls.
3718 if (isTailCall && FPDiff)
3719 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3720 Is64Bit, FPDiff, dl);
3722 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3723 SmallVector<SDValue, 8> MemOpChains;
3726 // The next loop assumes that the locations are in the same order as the outgoing arguments.
3728 assert(isSortedByValueNo(ArgLocs) &&
3729 "Argument Location list must be sorted before lowering");
3731 // Walk the register/memloc assignments, inserting copies/loads. In the case
3732 // of tail call optimization, arguments are handled later.
3733 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3734 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3736 assert(OutIndex < Outs.size() && "Invalid Out index");
3737 // Skip inalloca arguments, they have already been written.
3738 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3739 if (Flags.isInAlloca())
3742 CCValAssign &VA = ArgLocs[I];
3743 EVT RegVT = VA.getLocVT();
3744 SDValue Arg = OutVals[OutIndex];
3745 bool isByVal = Flags.isByVal();
3747 // Promote the value if needed.
3748 switch (VA.getLocInfo()) {
3749 default: llvm_unreachable("Unknown loc info!");
3750 case CCValAssign::Full: break;
3751 case CCValAssign::SExt:
3752 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3754 case CCValAssign::ZExt:
3755 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3757 case CCValAssign::AExt:
3758 if (Arg.getValueType().isVector() &&
3759 Arg.getValueType().getVectorElementType() == MVT::i1)
3760 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3761 else if (RegVT.is128BitVector()) {
3762 // Special case: passing MMX values in XMM registers.
3763 Arg = DAG.getBitcast(MVT::i64, Arg);
3764 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3765 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3767 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3769 case CCValAssign::BCvt:
3770 Arg = DAG.getBitcast(RegVT, Arg);
3772 case CCValAssign::Indirect: {
3774 // Memcpy the argument to a temporary stack slot to prevent
3775 // the caller from seeing any modifications the callee may make
3776 // as guaranteed by the `byval` attribute.
3777 int FrameIdx = MF.getFrameInfo().CreateStackObject(
3778 Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
3781 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
3783 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
3784 // From now on treat this as a regular pointer
3788 // Store the argument.
3789 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3790 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3791 Chain = DAG.getStore(
3792 Chain, dl, Arg, SpillSlot,
3793 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3800 if (VA.needsCustom()) {
3801 assert(VA.getValVT() == MVT::v64i1 &&
3802 "Currently the only custom case is when we split v64i1 to 2 regs");
3803 // Split v64i1 value into two registers
3804 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3806 } else if (VA.isRegLoc()) {
3807 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3808 if (isVarArg && IsWin64) {
3809 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3810 // shadow reg if callee is a varargs function.
3811 unsigned ShadowReg = 0;
3812 switch (VA.getLocReg()) {
3813 case X86::XMM0: ShadowReg = X86::RCX; break;
3814 case X86::XMM1: ShadowReg = X86::RDX; break;
3815 case X86::XMM2: ShadowReg = X86::R8; break;
3816 case X86::XMM3: ShadowReg = X86::R9; break;
3819 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3821 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3822 assert(VA.isMemLoc());
3823 if (!StackPtr.getNode())
3824 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3825 getPointerTy(DAG.getDataLayout()));
3826 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3827 dl, DAG, VA, Flags));
3831 if (!MemOpChains.empty())
3832 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3834 if (Subtarget.isPICStyleGOT()) {
3835 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3838 RegsToPass.push_back(std::make_pair(
3839 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3840 getPointerTy(DAG.getDataLayout()))));
3842 // If we are tail calling and generating PIC/GOT style code load the
3843 // address of the callee into ECX. The value in ecx is used as target of
3844 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3845 // for tail calls on PIC/GOT architectures. Normally we would just put the
3846 // address of GOT into ebx and then call target@PLT. But for tail calls
3847 // ebx would be restored (since ebx is callee saved) before jumping to the
3850 // Note: The actual moving to ECX is done further down.
3851 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3852 if (G && !G->getGlobal()->hasLocalLinkage() &&
3853 G->getGlobal()->hasDefaultVisibility())
3854 Callee = LowerGlobalAddress(Callee, DAG);
3855 else if (isa<ExternalSymbolSDNode>(Callee))
3856 Callee = LowerExternalSymbol(Callee, DAG);
3860 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3861 // From AMD64 ABI document:
3862 // For calls that may call functions that use varargs or stdargs
3863 // (prototype-less calls or calls to functions containing ellipsis (...) in
3864 // the declaration) %al is used as hidden argument to specify the number
3865 // of SSE registers used. The contents of %al do not need to match exactly
3866 // the number of registers, but must be an upper bound on the number of SSE
3867 // registers used and is in the range 0 - 8 inclusive.
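// For instance, a call like printf("%f\n", x) ends up with %al set to 1,
// since one SSE register carries the double argument (illustrative; any
// value that is >= the actual count and <= 8 satisfies the ABI).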
3869 // Count the number of XMM registers allocated.
3870 static const MCPhysReg XMMArgRegs[] = {
3871 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3872 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3874 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3875 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3876 && "SSE registers cannot be used when SSE is disabled");
3878 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3879 DAG.getConstant(NumXMMRegs, dl,
3883 if (isVarArg && IsMustTail) {
3884 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3885 for (const auto &F : Forwards) {
3886 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3887 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3891 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3892 // don't need this because the eligibility check rejects calls that require
3893 // shuffling arguments passed in memory.
3894 if (!IsSibcall && isTailCall) {
3895 // Force all the incoming stack arguments to be loaded from the stack
3896 // before any new outgoing arguments are stored to the stack, because the
3897 // outgoing stack slots may alias the incoming argument stack slots, and
3898 // the alias isn't otherwise explicit. This is slightly more conservative
3899 // than necessary, because it means that each store effectively depends
3900 // on every argument instead of just those arguments it would clobber.
3901 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3903 SmallVector<SDValue, 8> MemOpChains2;
3906 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3908 CCValAssign &VA = ArgLocs[I];
3910 if (VA.isRegLoc()) {
3911 if (VA.needsCustom()) {
3912 assert((CallConv == CallingConv::X86_RegCall) &&
3913 "Expecting custom case only in regcall calling convention");
3914 // This means that we are in a special case where one argument was
3915 // passed through two register locations - skip the next location.
3922 assert(VA.isMemLoc());
3923 SDValue Arg = OutVals[OutsIndex];
3924 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3925 // Skip inalloca arguments. They don't require any work.
3926 if (Flags.isInAlloca())
3928 // Create frame index.
3929 int32_t Offset = VA.getLocMemOffset() + FPDiff;
3930 uint32_t OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
3931 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3932 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3934 if (Flags.isByVal()) {
3935 // Copy relative to framepointer.
3936 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3937 if (!StackPtr.getNode())
3938 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3939 getPointerTy(DAG.getDataLayout()));
3940 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3943 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3947 // Store relative to framepointer.
3948 MemOpChains2.push_back(DAG.getStore(
3949 ArgChain, dl, Arg, FIN,
3950 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3954 if (!MemOpChains2.empty())
3955 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3957 // Store the return address to the appropriate stack slot.
3958 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3959 getPointerTy(DAG.getDataLayout()),
3960 RegInfo->getSlotSize(), FPDiff, dl);
3963 // Build a sequence of copy-to-reg nodes chained together with token chain
3964 // and flag operands which copy the outgoing args into registers.
3966 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3967 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3968 RegsToPass[i].second, InFlag);
3969 InFlag = Chain.getValue(1);
3972 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3973 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3974 // In the 64-bit large code model, we have to make all calls
3975 // through a register, since the call instruction's 32-bit
3976 // pc-relative offset may not be large enough to hold the whole
3978 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3979 // If the callee is a GlobalAddress node (quite common, every direct call
3980 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3982 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3984 // We should use extra load for direct calls to dllimported functions in
3986 const GlobalValue *GV = G->getGlobal();
3987 if (!GV->hasDLLImportStorageClass()) {
3988 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3990 Callee = DAG.getTargetGlobalAddress(
3991 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3993 if (OpFlags == X86II::MO_GOTPCREL) {
3995 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3996 getPointerTy(DAG.getDataLayout()), Callee);
3997 // Add extra indirection
3998 Callee = DAG.getLoad(
3999 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
4000 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
4003 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
4004 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
4005 unsigned char OpFlags =
4006 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
4008 Callee = DAG.getTargetExternalSymbol(
4009 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
4011 if (OpFlags == X86II::MO_GOTPCREL) {
4012 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
4013 getPointerTy(DAG.getDataLayout()), Callee);
4014 Callee = DAG.getLoad(
4015 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
4016 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
4018 } else if (Subtarget.isTarget64BitILP32() &&
4019 Callee->getValueType(0) == MVT::i32) {
4020 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4021 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4024 // Returns a chain & a flag for retval copy to use.
4025 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4026 SmallVector<SDValue, 8> Ops;
4028 if (!IsSibcall && isTailCall) {
4029 Chain = DAG.getCALLSEQ_END(Chain,
4030 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4031 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4032 InFlag = Chain.getValue(1);
4035 Ops.push_back(Chain);
4036 Ops.push_back(Callee);
4039 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
4041 // Add argument registers to the end of the list so that they are known live
4043 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4044 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4045 RegsToPass[i].second.getValueType()));
4047 // Add a register mask operand representing the call-preserved registers.
4048 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
4049 // set X86_INTR calling convention because it has the same CSR mask
4050 // (same preserved registers).
4051 const uint32_t *Mask = RegInfo->getCallPreservedMask(
4052 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
4053 assert(Mask && "Missing call preserved mask for calling convention");
4055 // If this is an invoke in a 32-bit function using a funclet-based
4056 // personality, assume the function clobbers all registers. If an exception
4057 // is thrown, the runtime will not restore CSRs.
4058 // FIXME: Model this more precisely so that we can register allocate across
4059 // the normal edge and spill and fill across the exceptional edge.
4060 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
4061 const Function &CallerFn = MF.getFunction();
4062 EHPersonality Pers =
4063 CallerFn.hasPersonalityFn()
4064 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4065 : EHPersonality::Unknown;
4066 if (isFuncletEHPersonality(Pers))
4067 Mask = RegInfo->getNoPreservedMask();
4070 // Define a new register mask from the existing mask.
4071 uint32_t *RegMask = nullptr;
4073 // In some calling conventions we need to remove the used physical registers
4074 // from the reg mask.
4075 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4076 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4078 // Allocate a new Reg Mask and copy Mask.
4079 RegMask = MF.allocateRegMask();
4080 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4081 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4083 // Make sure all sub registers of the argument registers are reset
4085 for (auto const &RegPair : RegsToPass)
4086 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4087 SubRegs.isValid(); ++SubRegs)
4088 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4090 // Create the RegMask Operand according to our updated mask.
4091 Ops.push_back(DAG.getRegisterMask(RegMask));
4093 // Create the RegMask Operand according to the static mask.
4094 Ops.push_back(DAG.getRegisterMask(Mask));
4097 if (InFlag.getNode())
4098 Ops.push_back(InFlag);
4102 //// If this is the first return lowered for this function, add the regs
4103 //// to the liveout set for the function.
4104 // This isn't right, although it's probably harmless on x86; liveouts
4105 // should be computed from returns not tail calls. Consider a void
4106 // function making a tail call to a function returning int.
4107 MF.getFrameInfo().setHasTailCall();
4108 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4111 if (HasNoCfCheck && IsCFProtectionSupported) {
4112 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4114 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4116 InFlag = Chain.getValue(1);
4118 // Create the CALLSEQ_END node.
4119 unsigned NumBytesForCalleeToPop;
4120 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4121 DAG.getTarget().Options.GuaranteedTailCallOpt))
4122 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4123 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4124 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4125 SR == StackStructReturn)
4126 // If this is a call to a struct-return function, the callee
4127 // pops the hidden struct pointer, so we have to push it back.
4128 // This is common for Darwin/X86, Linux & Mingw32 targets.
4129 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4130 NumBytesForCalleeToPop = 4;
4132 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4134 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
4135 // No need to reset the stack after the call if the call doesn't return. To
4136 // make the MI verify, we'll pretend the callee does it for us.
4137 NumBytesForCalleeToPop = NumBytes;
4140 // Returns a flag for retval copy to use.
4142 Chain = DAG.getCALLSEQ_END(Chain,
4143 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4144 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4147 InFlag = Chain.getValue(1);
4150 // Handle result values, copying them out of physregs into vregs that we
4152 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4156 //===----------------------------------------------------------------------===//
4157 // Fast Calling Convention (tail call) implementation
4158 //===----------------------------------------------------------------------===//
4160 // Like StdCall, the callee cleans up the arguments; unlike StdCall, ECX is
4161 // reserved for storing the address of the tail-called function. Only 2
4162 // registers are free for argument passing (inreg). Tail call optimization is
4163 // performed provided:
4164 // * tailcallopt is enabled
4165 // * caller/callee are fastcc
4166 // On X86_64 architecture with GOT-style position independent code only local
4167 // (within module) calls are supported at the moment.
4168 // To keep the stack aligned according to the platform ABI, the function
4169 // GetAlignedArgumentStackSize ensures that the argument delta is always a
4170 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
4171 // for example.) If the tail-called callee has more arguments than the caller,
4172 // the caller needs to make sure that there is room to move the RETADDR to.
4173 // This is achieved by reserving an area the size of the argument delta right
4174 // after the original RETADDR, but before the saved frame pointer or the
4175 // spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
4187 /// Make the stack size aligned, e.g. to 16n + 12, to satisfy a 16-byte alignment requirement.
4190 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
4191 SelectionDAG& DAG) const {
4192 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4193 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
4194 unsigned StackAlignment = TFI.getStackAlignment();
4195 uint64_t AlignMask = StackAlignment - 1;
4196 int64_t Offset = StackSize;
4197 unsigned SlotSize = RegInfo->getSlotSize();
4198 if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
4199 // The misalignment is at most (StackAlignment - SlotSize), so just add the difference.
4200 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
4201 } else {
4202 // Mask out the lower bits and add one full StackAlignment plus (StackAlignment - SlotSize).
4203 Offset = ((~AlignMask) & Offset) + StackAlignment +
4204 (StackAlignment - SlotSize);
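// Worked example (16-byte alignment, 4-byte slot, purely illustrative):
// StackSize 20 -> 20 + (12 - 4) = 28 and StackSize 30 -> 16 + 16 + 12 = 44,
// so the result is always of the form 16n + 12 and pushing the 4-byte return
// address brings the stack back to 16-byte alignment.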
4209 /// Return true if the given stack call argument is already available in the
4210 /// same position (relatively) of the caller's incoming argument stack.
4212 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4213 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4214 const X86InstrInfo *TII, const CCValAssign &VA) {
4215 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4218 // Look through nodes that don't alter the bits of the incoming value.
4219 unsigned Op = Arg.getOpcode();
4220 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4221 Arg = Arg.getOperand(0);
4224 if (Op == ISD::TRUNCATE) {
4225 const SDValue &TruncInput = Arg.getOperand(0);
4226 if (TruncInput.getOpcode() == ISD::AssertZext &&
4227 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4228 Arg.getValueType()) {
4229 Arg = TruncInput.getOperand(0);
4237 if (Arg.getOpcode() == ISD::CopyFromReg) {
4238 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4239 if (!TargetRegisterInfo::isVirtualRegister(VR))
4241 MachineInstr *Def = MRI->getVRegDef(VR);
4244 if (!Flags.isByVal()) {
4245 if (!TII->isLoadFromStackSlot(*Def, FI))
4248 unsigned Opcode = Def->getOpcode();
4249 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4250 Opcode == X86::LEA64_32r) &&
4251 Def->getOperand(1).isFI()) {
4252 FI = Def->getOperand(1).getIndex();
4253 Bytes = Flags.getByValSize();
4257 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4258 if (Flags.isByVal())
4259 // ByVal argument is passed in as a pointer but it's now being
4260 // dereferenced. e.g.
4261 // define @foo(%struct.X* %A) {
4262 // tail call @bar(%struct.X* byval %A)
4265 SDValue Ptr = Ld->getBasePtr();
4266 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4269 FI = FINode->getIndex();
4270 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4271 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4272 FI = FINode->getIndex();
4273 Bytes = Flags.getByValSize();
4277 assert(FI != INT_MAX);
4278 if (!MFI.isFixedObjectIndex(FI))
4281 if (Offset != MFI.getObjectOffset(FI))
4284 // If this is not byval, check that the argument stack object is immutable.
4285 // inalloca and argument copy elision can create mutable argument stack
4286 // objects. Byval objects can be mutated, but a byval call intends to pass the
4288 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4291 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4292 // If the argument location is wider than the argument type, check that any
4293 // extension flags match.
4294 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4295 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4300 return Bytes == MFI.getObjectSize(FI);
4303 /// Check whether the call is eligible for tail call optimization. Targets
4304 /// that want to do tail call optimization should implement this function.
4305 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4306 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4307 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4308 const SmallVectorImpl<ISD::OutputArg> &Outs,
4309 const SmallVectorImpl<SDValue> &OutVals,
4310 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4311 if (!mayTailCallThisCC(CalleeCC))
4314 // If -tailcallopt is specified, make fastcc functions tail-callable.
4315 MachineFunction &MF = DAG.getMachineFunction();
4316 const Function &CallerF = MF.getFunction();
4318 // If the function return type is x86_fp80 and the callee return type is not,
4319 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4320 // perform a tailcall optimization here.
4321 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4324 CallingConv::ID CallerCC = CallerF.getCallingConv();
4325 bool CCMatch = CallerCC == CalleeCC;
4326 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4327 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4329 // Win64 functions have extra shadow space for argument homing. Don't do the
4330 // sibcall if the caller and callee have mismatched expectations for this space.
4332 if (IsCalleeWin64 != IsCallerWin64)
4335 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4336 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4341 // Look for obvious safe cases to perform tail call optimization that do not
4342 // require ABI changes. This is what gcc calls sibcall.
4344 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4345 // emit a special epilogue.
4346 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4347 if (RegInfo->needsStackRealignment(MF))
4350 // Also avoid sibcall optimization if either caller or callee uses struct
4351 // return semantics.
4352 if (isCalleeStructRet || isCallerStructRet)
4355 // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
4357 LLVMContext &C = *DAG.getContext();
4358 if (isVarArg && !Outs.empty()) {
4359 // Optimizing for varargs on Win64 is unlikely to be safe without
4360 // additional testing.
4361 if (IsCalleeWin64 || IsCallerWin64)
4364 SmallVector<CCValAssign, 16> ArgLocs;
4365 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4367 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4368 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4369 if (!ArgLocs[i].isRegLoc())
4373 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4374 // stack. Therefore, if it's not used by the call it is not safe to optimize
4375 // this into a sibcall.
4376 bool Unused = false;
4377 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4378 if (!Ins[i].Used) {
4379 Unused = true;
4380 break;
4381 }
4382 }
4383 if (Unused) {
4384 SmallVector<CCValAssign, 16> RVLocs;
4385 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4386 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4387 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4388 CCValAssign &VA = RVLocs[i];
4389 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4394 // Check that the call results are passed in the same way.
4395 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4396 RetCC_X86, RetCC_X86))
4398 // The callee has to preserve all registers the caller needs to preserve.
4399 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4400 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4402 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4403 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4407 unsigned StackArgsSize = 0;
4409 // If the callee takes no arguments then go on to check the results of the call.
4411 if (!Outs.empty()) {
4412 // Check if stack adjustment is needed. For now, do not do this if any
4413 // argument is passed on the stack.
4414 SmallVector<CCValAssign, 16> ArgLocs;
4415 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4417 // Allocate shadow area for Win64
4418 if (IsCalleeWin64)
4419 CCInfo.AllocateStack(32, 8);
4421 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4422 StackArgsSize = CCInfo.getNextStackOffset();
4424 if (CCInfo.getNextStackOffset()) {
4425 // Check if the arguments are already laid out in the right way as
4426 // the caller's fixed stack objects.
4427 MachineFrameInfo &MFI = MF.getFrameInfo();
4428 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4429 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4430 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4431 CCValAssign &VA = ArgLocs[i];
4432 SDValue Arg = OutVals[i];
4433 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4434 if (VA.getLocInfo() == CCValAssign::Indirect)
4435 return false;
4436 if (!VA.isRegLoc()) {
4437 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4438 MFI, MRI, TII, VA))
4439 return false;
4440 }
4441 }
4442 }
4444 bool PositionIndependent = isPositionIndependent();
4445 // If the tailcall address may be in a register, then make sure it's
4446 // possible to register allocate for it. In 32-bit, the call address can
4447 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4448 // callee-saved registers are restored. These happen to be the same
4449 // registers used to pass 'inreg' arguments so watch out for those.
4450 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4451 !isa<ExternalSymbolSDNode>(Callee)) ||
4452 PositionIndependent)) {
4453 unsigned NumInRegs = 0;
4454 // In PIC we need an extra register to formulate the address computation
4456 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4458 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4459 CCValAssign &VA = ArgLocs[i];
4462 unsigned Reg = VA.getLocReg();
4465 case X86::EAX: case X86::EDX: case X86::ECX:
4466 if (++NumInRegs == MaxInRegs)
4473 const MachineRegisterInfo &MRI = MF.getRegInfo();
4474 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4478 bool CalleeWillPop =
4479 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4480 MF.getTarget().Options.GuaranteedTailCallOpt);
4482 if (unsigned BytesToPop =
4483 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4484 // If we have bytes to pop, the callee must pop them.
4485 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4486 if (!CalleePopMatches)
4488 } else if (CalleeWillPop && StackArgsSize > 0) {
4489 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4496 FastISel *
4497 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4498 const TargetLibraryInfo *libInfo) const {
4499 return X86::createFastISel(funcInfo, libInfo);
4502 //===----------------------------------------------------------------------===//
4503 // Other Lowering Hooks
4504 //===----------------------------------------------------------------------===//
4506 static bool MayFoldLoad(SDValue Op) {
4507 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4510 static bool MayFoldIntoStore(SDValue Op) {
4511 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4514 static bool MayFoldIntoZeroExtend(SDValue Op) {
4515 if (Op.hasOneUse()) {
4516 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4517 return (ISD::ZERO_EXTEND == Opcode);
4522 static bool isTargetShuffle(unsigned Opcode) {
4523 switch (Opcode) {
4524 default: return false;
4525 case X86ISD::BLENDI:
4526 case X86ISD::PSHUFB:
4527 case X86ISD::PSHUFD:
4528 case X86ISD::PSHUFHW:
4529 case X86ISD::PSHUFLW:
4531 case X86ISD::INSERTPS:
4532 case X86ISD::EXTRQI:
4533 case X86ISD::INSERTQI:
4534 case X86ISD::PALIGNR:
4535 case X86ISD::VSHLDQ:
4536 case X86ISD::VSRLDQ:
4537 case X86ISD::MOVLHPS:
4538 case X86ISD::MOVHLPS:
4539 case X86ISD::MOVSHDUP:
4540 case X86ISD::MOVSLDUP:
4541 case X86ISD::MOVDDUP:
4544 case X86ISD::UNPCKL:
4545 case X86ISD::UNPCKH:
4546 case X86ISD::VBROADCAST:
4547 case X86ISD::VPERMILPI:
4548 case X86ISD::VPERMILPV:
4549 case X86ISD::VPERM2X128:
4550 case X86ISD::SHUF128:
4551 case X86ISD::VPERMIL2:
4552 case X86ISD::VPERMI:
4553 case X86ISD::VPPERM:
4554 case X86ISD::VPERMV:
4555 case X86ISD::VPERMV3:
4556 case X86ISD::VZEXT_MOVL:
4561 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4562 switch (Opcode) {
4563 default: return false;
4564 // Target Shuffles.
4565 case X86ISD::PSHUFB:
4566 case X86ISD::VPERMILPV:
4567 case X86ISD::VPERMIL2:
4568 case X86ISD::VPPERM:
4569 case X86ISD::VPERMV:
4570 case X86ISD::VPERMV3:
4572 // 'Faux' Target Shuffles.
4580 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4581 MachineFunction &MF = DAG.getMachineFunction();
4582 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4583 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4584 int ReturnAddrIndex = FuncInfo->getRAIndex();
4586 if (ReturnAddrIndex == 0) {
4587 // Set up a frame object for the return address.
4588 unsigned SlotSize = RegInfo->getSlotSize();
4589 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4592 FuncInfo->setRAIndex(ReturnAddrIndex);
4595 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4598 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4599 bool hasSymbolicDisplacement) {
4600 // Offset should fit into 32 bit immediate field.
4601 if (!isInt<32>(Offset))
4604 // If we don't have a symbolic displacement, we don't have any extra restrictions.
4606 if (!hasSymbolicDisplacement)
4609 // FIXME: Some tweaks might be needed for medium code model.
4610 if (M != CodeModel::Small && M != CodeModel::Kernel)
4613 // For the small code model we assume that the last object is 16MB below the
4614 // end of the 31-bit boundary. We may also accept pretty large negative
4615 // constants, knowing that all objects lie in the positive half of the address space.
4616 if (M == CodeModel::Small && Offset < 16*1024*1024)
4617 return true;
4619 // For the kernel code model we know that all objects reside in the negative
4620 // half of the 32-bit address space, so we may not accept negative offsets,
4621 // but we may accept pretty large positive ones.
4622 if (M == CodeModel::Kernel && Offset >= 0)
4628 /// Determines whether the callee is required to pop its own arguments.
4629 /// Callee pop is necessary to support tail calls.
4630 bool X86::isCalleePop(CallingConv::ID CallingConv,
4631 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4632 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4633 // can guarantee TCO.
4634 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4637 switch (CallingConv) {
4640 case CallingConv::X86_StdCall:
4641 case CallingConv::X86_FastCall:
4642 case CallingConv::X86_ThisCall:
4643 case CallingConv::X86_VectorCall:
4648 /// Return true if the condition is an unsigned comparison operation.
4649 static bool isX86CCUnsigned(unsigned X86CC) {
4652 llvm_unreachable("Invalid integer condition!");
4668 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4669 switch (SetCCOpcode) {
4670 default: llvm_unreachable("Invalid integer condition!");
4671 case ISD::SETEQ: return X86::COND_E;
4672 case ISD::SETGT: return X86::COND_G;
4673 case ISD::SETGE: return X86::COND_GE;
4674 case ISD::SETLT: return X86::COND_L;
4675 case ISD::SETLE: return X86::COND_LE;
4676 case ISD::SETNE: return X86::COND_NE;
4677 case ISD::SETULT: return X86::COND_B;
4678 case ISD::SETUGT: return X86::COND_A;
4679 case ISD::SETULE: return X86::COND_BE;
4680 case ISD::SETUGE: return X86::COND_AE;
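// For example, an unsigned (setcc %a, %b, setugt) maps to COND_A (the
// JA/CMOVA/SETA family), while the signed setgt form maps to COND_G.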
4684 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4685 /// condition code, returning the condition code and the LHS/RHS of the
4686 /// comparison to make.
4687 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4688 bool isFP, SDValue &LHS, SDValue &RHS,
4689 SelectionDAG &DAG) {
4691 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4692 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4693 // X > -1 -> X == 0, jump !sign.
4694 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4695 return X86::COND_NS;
4697 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4698 // X < 0 -> X == 0, jump on sign.
4699 return X86::COND_S;
4700 }
4701 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4702 // X < 1 -> X <= 0
4703 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4704 return X86::COND_LE;
4705 }
4706 }
4708 return TranslateIntegerX86CC(SetCCOpcode);
4711 // First determine if it is required or is profitable to flip the operands.
4713 // If LHS is a foldable load, but RHS is not, flip the condition.
4714 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4715 !ISD::isNON_EXTLoad(RHS.getNode())) {
4716 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4717 std::swap(LHS, RHS);
4720 switch (SetCCOpcode) {
4726 std::swap(LHS, RHS);
4730 // On a floating point condition, the flags are set as follows:
4731 // ZF | PF | CF | op
4732 //  0 |  0 |  0 | X > Y
4733 //  0 |  0 |  1 | X < Y
4734 //  1 |  0 |  0 | X == Y
4735 //  1 |  1 |  1 | unordered
4736 switch (SetCCOpcode) {
4737 default: llvm_unreachable("Condcode should be pre-legalized away");
4739 case ISD::SETEQ: return X86::COND_E;
4740 case ISD::SETOLT: // flipped
4742 case ISD::SETGT: return X86::COND_A;
4743 case ISD::SETOLE: // flipped
4745 case ISD::SETGE: return X86::COND_AE;
4746 case ISD::SETUGT: // flipped
4748 case ISD::SETLT: return X86::COND_B;
4749 case ISD::SETUGE: // flipped
4751 case ISD::SETLE: return X86::COND_BE;
4753 case ISD::SETNE: return X86::COND_NE;
4754 case ISD::SETUO: return X86::COND_P;
4755 case ISD::SETO: return X86::COND_NP;
4757 case ISD::SETUNE: return X86::COND_INVALID;
4761 /// Is there a floating point cmov for the specific X86 condition code?
4762 /// The current x86 ISA includes the following FP cmov instructions:
4763 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4764 static bool hasFPCMov(unsigned X86CC) {
4781 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4783 MachineFunction &MF,
4784 unsigned Intrinsic) const {
4786 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4790 Info.opc = ISD::INTRINSIC_W_CHAIN;
4791 Info.flags = MachineMemOperand::MONone;
4794 switch (IntrData->Type) {
4795 case TRUNCATE_TO_MEM_VI8:
4796 case TRUNCATE_TO_MEM_VI16:
4797 case TRUNCATE_TO_MEM_VI32: {
4798 Info.ptrVal = I.getArgOperand(0);
4799 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4800 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4801 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4802 ScalarVT = MVT::i8;
4803 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4804 ScalarVT = MVT::i16;
4805 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4806 ScalarVT = MVT::i32;
4808 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4810 Info.flags |= MachineMemOperand::MOStore;
4820 /// Returns true if the target can instruction select the
4821 /// specified FP immediate natively. If false, the legalizer will
4822 /// materialize the FP immediate as a load from a constant pool.
4823 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4824 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4825 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4831 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4832 ISD::LoadExtType ExtTy,
4833 EVT NewVT) const {
4834 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4835 // relocation target a movq or addq instruction: don't let the load shrink.
4836 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4837 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4838 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4839 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4843 /// Returns true if it is beneficial to convert a load of a constant
4844 /// to just the constant itself.
4845 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4846 Type *Ty) const {
4847 assert(Ty->isIntegerTy());
4849 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4850 if (BitSize == 0 || BitSize > 64)
4851 return false;
4853 return true;
4854 }
4855 bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
4856 // If we are using XMM registers in the ABI and the condition of the select is
4857 // a floating-point compare and we have blendv or conditional move, then it is
4858 // cheaper to select instead of doing a cross-register move and creating a
4859 // load that depends on the compare result.
4860 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
4863 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4864 // TODO: It might be a win to ease or lift this restriction, but the generic
4865 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4866 if (VT.isVector() && Subtarget.hasAVX512())
4872 bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
4873 // TODO: We handle scalars using custom code, but generic combining could make
4874 // that unnecessary.
4876 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
4879 // If vector multiply is legal, assume that's faster than shl + add/sub.
4880 // TODO: Multiply is a complex op with higher latency and lower throughput in
4881 // most implementations, so this check could be loosened based on type
4882 // and/or a CPU attribute.
4883 if (isOperationLegal(ISD::MUL, VT))
4886 // shl+add, shl+sub, shl+add+neg
4887 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
4888 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
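// Illustrative cases for the check above: a splat multiply by 9 is decomposed
// because (9 - 1) is a power of two (shl 3 + add), and a multiply by 7 is
// decomposed because (7 + 1) is a power of two (shl 3 - sub); a multiply by 10
// matches none of the patterns and is left as a mul.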
4891 bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
4892 bool IsSigned) const {
4893 // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available.
4894 return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
4897 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4898 unsigned Index) const {
4899 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4902 // Mask vectors support all subregister combinations and operations that
4903 // extract half of vector.
4904 if (ResVT.getVectorElementType() == MVT::i1)
4905 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4906 (Index == ResVT.getVectorNumElements()));
4908 return (Index % ResVT.getVectorNumElements()) == 0;
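// For example, extracting the upper v4f32 half of a v8f32 (Index == 4) is
// considered cheap (roughly a VEXTRACTF128 or a plain subregister use), while
// extracting elements 2..5 (Index == 2) is not.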
4911 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
4912 // If the vector op is not supported, try to convert to scalar.
4913 EVT VecVT = VecOp.getValueType();
4914 if (!isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), VecVT))
4917 // If the vector op is supported, but the scalar op is not, the transform may
4918 // not be worthwhile.
4919 EVT ScalarVT = VecVT.getScalarType();
4920 return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT);
4923 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4924 // Speculate cttz only if we can directly use TZCNT.
4925 return Subtarget.hasBMI();
4928 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4929 // Speculate ctlz only if we can directly use LZCNT.
4930 return Subtarget.hasLZCNT();
4933 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4934 EVT BitcastVT) const {
4935 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
4936 BitcastVT.getVectorElementType() == MVT::i1)
4939 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
4942 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4945 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4946 const SelectionDAG &DAG) const {
4947 // Do not merge to a float value size (128 bits) if the NoImplicitFloat
4948 // attribute is set.
4949 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4950 Attribute::NoImplicitFloat);
4952 if (NoFloat) {
4953 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4954 return (MemVT.getSizeInBits() <= MaxIntSize);
4955 }
4956 return true;
4957 }
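// For example, with NoImplicitFloat on a 64-bit target, merging two i32 stores
// into a single i64 store is still allowed (64 <= 64), but merging four i32
// stores into one 128-bit store is rejected because it would require an XMM
// register.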
4959 bool X86TargetLowering::isCtlzFast() const {
4960 return Subtarget.hasFastLZCNT();
4963 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4964 const Instruction &AndI) const {
4968 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4969 EVT VT = Y.getValueType();
4974 if (!Subtarget.hasBMI())
4977 // There are only 32-bit and 64-bit forms for 'andn'.
4978 if (VT != MVT::i32 && VT != MVT::i64)
4981 return !isa<ConstantSDNode>(Y);
4984 bool X86TargetLowering::hasAndNot(SDValue Y) const {
4985 EVT VT = Y.getValueType();
4988 return hasAndNotCompare(Y);
4992 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
4995 if (VT == MVT::v4i32)
4998 return Subtarget.hasSSE2();
5001 bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
5002 EVT VT = Y.getValueType();
5004 // For vectors, we don't have a preference, but we probably want a mask.
5008 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5009 if (VT == MVT::i64 && !Subtarget.is64Bit())
5015 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5016 // Any legal vector type can be splatted more efficiently than
5017 // loading/spilling from memory.
5018 return isTypeLegal(VT);
5021 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5022 MVT VT = MVT::getIntegerVT(NumBits);
5023 if (isTypeLegal(VT))
5026 // PMOVMSKB can handle this.
5027 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5030 // VPMOVMSKB can handle this.
5031 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5034 // TODO: Allow 64-bit type for 32-bit target.
5035 // TODO: 512-bit types should be allowed, but make sure that those
5036 // cases are handled in combineVectorSizedSetCCEquality().
5038 return MVT::INVALID_SIMPLE_VALUE_TYPE;
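// For example, a 128-bit equality compare (e.g. from memcmp expansion) can be
// lowered as roughly PCMPEQB + PMOVMSKB + CMP against 0xFFFF, which is why
// v16i8 is reported as fast above when it is legal.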
5041 /// Val is the undef sentinel value or equal to the specified value.
5042 static bool isUndefOrEqual(int Val, int CmpVal) {
5043 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5046 /// Val is either the undef or zero sentinel value.
5047 static bool isUndefOrZero(int Val) {
5048 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5051 /// Return true if every element in Mask, beginning
5052 /// from position Pos and ending in Pos+Size is the undef sentinel value.
5053 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5054 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
5055 if (Mask[i] != SM_SentinelUndef)
5060 /// Return true if Val falls within the specified half-open range [Low, Hi).
5061 static bool isInRange(int Val, int Low, int Hi) {
5062 return (Val >= Low && Val < Hi);
5065 /// Return true if the value of any element in Mask falls within the specified
5066 /// half-open range [Low, Hi).
5067 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5069 if (isInRange(M, Low, Hi))
5074 /// Return true if Val is undef or if its value falls within the
5075 /// specified half-open range [Low, Hi).
5076 static bool isUndefOrInRange(int Val, int Low, int Hi) {
5077 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5080 /// Return true if every element in Mask is undef or if its value
5081 /// falls within the specified half-open range [Low, Hi).
5082 static bool isUndefOrInRange(ArrayRef<int> Mask,
5085 if (!isUndefOrInRange(M, Low, Hi))
5090 /// Return true if Val is undef, zero or if its value falls within the
5091 /// specified half-open range [Low, Hi).
5092 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5093 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5096 /// Return true if every element in Mask is undef, zero or if its value
5097 /// falls within the specified half-open range [Low, Hi).
5098 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5100 if (!isUndefOrZeroOrInRange(M, Low, Hi))
5105 /// Return true if every element in Mask, beginning
5106 /// from position Pos and ending in Pos + Size, falls within the specified
5107 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5108 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5109 unsigned Size, int Low, int Step = 1) {
5110 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5111 if (!isUndefOrEqual(Mask[i], Low))
5116 /// Return true if every element in Mask, beginning
5117 /// from position Pos and ending in Pos+Size, falls within the specified
5118 /// sequential range [Low, Low + Size), or is undef or zero.
5119 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5120 unsigned Size, int Low) {
5121 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
5122 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5127 /// Return true if every element in Mask, beginning
5128 /// from position Pos and ending in Pos+Size is undef or is zero.
5129 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5131 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
5132 if (!isUndefOrZero(Mask[i]))
5137 /// Helper function to test whether a shuffle mask could be
5138 /// simplified by widening the elements being shuffled.
5140 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5141 /// leaves it in an unspecified state.
5143 /// NOTE: This must handle normal vector shuffle masks and *target* vector
5144 /// shuffle masks. The latter have the special property of a '-2' representing
5145 /// a zeroed lane of a vector.
5146 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5147 SmallVectorImpl<int> &WidenedMask) {
5148 WidenedMask.assign(Mask.size() / 2, 0);
5149 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5151 int M1 = Mask[i + 1];
5153 // If both elements are undef, it's trivial.
5154 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5155 WidenedMask[i / 2] = SM_SentinelUndef;
5159 // Check for an undef mask and a mask value properly aligned to fit with
5160 // a pair of values. If we find such a case, use the non-undef mask's value.
5161 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5162 WidenedMask[i / 2] = M1 / 2;
5165 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5166 WidenedMask[i / 2] = M0 / 2;
5170 // When zeroing, we need to spread the zeroing across both lanes to widen.
5171 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5172 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5173 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5174 WidenedMask[i / 2] = SM_SentinelZero;
5180 // Finally check if the two mask values are adjacent and aligned with
5182 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5183 WidenedMask[i / 2] = M0 / 2;
5187 // Otherwise we can't safely widen the elements used in this shuffle.
5190 assert(WidenedMask.size() == Mask.size() / 2 &&
5191 "Incorrect size of mask after widening the elements!");
5196 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5197 const APInt &Zeroable,
5198 SmallVectorImpl<int> &WidenedMask) {
5199 SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
5200 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
5201 if (TargetMask[i] == SM_SentinelUndef)
5204 TargetMask[i] = SM_SentinelZero;
5206 return canWidenShuffleElements(TargetMask, WidenedMask);
5209 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5210 SmallVector<int, 32> WidenedMask;
5211 return canWidenShuffleElements(Mask, WidenedMask);
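// Worked example for canWidenShuffleElements (illustrative): the v8i16 mask
// <0,1, 6,7, -1,-1, 2,3> widens to the v4i32 mask <0, 3, -1, 1>, since every
// pair is either fully undef or an aligned (even, even+1) pair. A mask starting
// <0,2, ...> cannot be widened because 0 and 2 are not an adjacent pair.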
5214 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
5215 bool X86::isZeroNode(SDValue Elt) {
5216 return isNullConstant(Elt) || isNullFPConstant(Elt);
5219 // Build a vector of constants.
5220 // Use an UNDEF node if MaskElt == -1.
5221 // Split 64-bit constants in the 32-bit mode.
5222 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5223 const SDLoc &dl, bool IsMask = false) {
5225 SmallVector<SDValue, 32> Ops;
5228 MVT ConstVecVT = VT;
5229 unsigned NumElts = VT.getVectorNumElements();
5230 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5231 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5232 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5236 MVT EltVT = ConstVecVT.getVectorElementType();
5237 for (unsigned i = 0; i < NumElts; ++i) {
5238 bool IsUndef = Values[i] < 0 && IsMask;
5239 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5240 DAG.getConstant(Values[i], dl, EltVT);
5241 Ops.push_back(OpNode);
5243 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5244 DAG.getConstant(0, dl, EltVT));
5246 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5248 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5252 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5253 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5254 assert(Bits.size() == Undefs.getBitWidth() &&
5255 "Unequal constant and undef arrays");
5256 SmallVector<SDValue, 32> Ops;
5259 MVT ConstVecVT = VT;
5260 unsigned NumElts = VT.getVectorNumElements();
5261 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5262 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5263 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5267 MVT EltVT = ConstVecVT.getVectorElementType();
5268 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5270 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5273 const APInt &V = Bits[i];
5274 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5276 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5277 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5278 } else if (EltVT == MVT::f32) {
5279 APFloat FV(APFloat::IEEEsingle(), V);
5280 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5281 } else if (EltVT == MVT::f64) {
5282 APFloat FV(APFloat::IEEEdouble(), V);
5283 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5285 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5289 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5290 return DAG.getBitcast(VT, ConstsNode);
5293 /// Returns a vector of specified type with all zero elements.
5294 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5295 SelectionDAG &DAG, const SDLoc &dl) {
5296 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5297 VT.getVectorElementType() == MVT::i1) &&
5298 "Unexpected vector type");
5300 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5301 // type. This ensures they get CSE'd. But if the integer type is not
5302 // available, use a floating-point +0.0 instead.
5304 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5305 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5306 } else if (VT.getVectorElementType() == MVT::i1) {
5307 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5308 "Unexpected vector type");
5309 Vec = DAG.getConstant(0, dl, VT);
5311 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5312 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5314 return DAG.getBitcast(VT, Vec);
5317 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5318 const SDLoc &dl, unsigned vectorWidth) {
5319 EVT VT = Vec.getValueType();
5320 EVT ElVT = VT.getVectorElementType();
5321 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5322 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5323 VT.getVectorNumElements()/Factor);
5325 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5326 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5327 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5329 // This is the index of the first element of the vectorWidth-bit chunk
5330 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5331 IdxVal &= ~(ElemsPerChunk - 1);
5333 // If the input is a buildvector just emit a smaller one.
5334 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5335 return DAG.getBuildVector(ResultVT, dl,
5336 Vec->ops().slice(IdxVal, ElemsPerChunk));
5338 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5339 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
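// For example, extracting 128 bits from a v8i32 with IdxVal == 5 rounds the
// index down to 4 (ElemsPerChunk == 4) and produces a v4i32 holding elements
// 4-7 of the source.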
5342 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5343 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5344 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5345 /// instructions or a simple subregister reference. Idx is an index in the
5346 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5347 /// lowering EXTRACT_VECTOR_ELT operations easier.
5348 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5349 SelectionDAG &DAG, const SDLoc &dl) {
5350 assert((Vec.getValueType().is256BitVector() ||
5351 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5352 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5355 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5356 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5357 SelectionDAG &DAG, const SDLoc &dl) {
5358 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5359 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5362 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5363 SelectionDAG &DAG, const SDLoc &dl,
5364 unsigned vectorWidth) {
5365 assert((vectorWidth == 128 || vectorWidth == 256) &&
5366 "Unsupported vector width");
5367 // Inserting an UNDEF subvector just returns Result unchanged.
5368 if (Vec.isUndef())
5369 return Result;
5370 EVT VT = Vec.getValueType();
5371 EVT ElVT = VT.getVectorElementType();
5372 EVT ResultVT = Result.getValueType();
5374 // Insert the relevant vectorWidth bits.
5375 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5376 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5378 // This is the index of the first element of the vectorWidth-bit chunk
5379 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5380 IdxVal &= ~(ElemsPerChunk - 1);
5382 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5383 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5386 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5387 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5388 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5389 /// simple superregister reference. Idx is an index in the 128 bits
5390 /// we want. It need not be aligned to a 128-bit boundary. That makes
5391 /// lowering INSERT_VECTOR_ELT operations easier.
5392 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5393 SelectionDAG &DAG, const SDLoc &dl) {
5394 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5395 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5398 /// Widen a vector to a larger size with the same scalar type, with the new
5399 /// elements either zero or undef.
5400 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5401 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5403 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5404 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5405 "Unsupported vector widening type");
5406 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5407 : DAG.getUNDEF(VT);
5408 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5409 DAG.getIntPtrConstant(0, dl));
5412 // Helper for splitting operands of an operation to legal target size and
5413 // apply a function on each part.
5414 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5415 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5416 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5417 // The argument Builder is a function that will be applied on each split part:
5418 // SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
5419 template <typename F>
5420 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5421 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5422 F Builder, bool CheckBWI = true) {
5423 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5424 unsigned NumSubs = 1;
5425 if ((CheckBWI && Subtarget.useBWIRegs()) ||
5426 (!CheckBWI && Subtarget.useAVX512Regs())) {
5427 if (VT.getSizeInBits() > 512) {
5428 NumSubs = VT.getSizeInBits() / 512;
5429 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5431 } else if (Subtarget.hasAVX2()) {
5432 if (VT.getSizeInBits() > 256) {
5433 NumSubs = VT.getSizeInBits() / 256;
5434 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5437 if (VT.getSizeInBits() > 128) {
5438 NumSubs = VT.getSizeInBits() / 128;
5439 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
5443 if (NumSubs == 1)
5444 return Builder(DAG, DL, Ops);
5446 SmallVector<SDValue, 4> Subs;
5447 for (unsigned i = 0; i != NumSubs; ++i) {
5448 SmallVector<SDValue, 2> SubOps;
5449 for (SDValue Op : Ops) {
5450 EVT OpVT = Op.getValueType();
5451 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5452 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5453 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5455 Subs.push_back(Builder(DAG, DL, SubOps));
5457 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
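// Illustrative usage (a sketch only; the lambda and value names here are for
// exposition and are not taken from a specific caller in this file):
//   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                            ArrayRef<SDValue> Ops) {
//     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
//     return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
//   };
//   SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, WideVT, {N0, N1},
//                                   PMADDWDBuilder, /*CheckBWI*/ false);
// SplitOpsAndApply cuts N0/N1 into legal-width pieces, applies the builder to
// each piece, and concatenates the partial results back into WideVT.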
5460 // Return true if the instruction zeroes the unused upper part of the
5461 // destination and accepts a mask.
5462 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5467 case X86ISD::CMPM_RND:
5473 /// Insert i1-subvector to i1-vector.
5474 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5475 const X86Subtarget &Subtarget) {
5478 SDValue Vec = Op.getOperand(0);
5479 SDValue SubVec = Op.getOperand(1);
5480 SDValue Idx = Op.getOperand(2);
5482 if (!isa<ConstantSDNode>(Idx))
5485 // Inserting undef is a nop. We can just return the original vector.
5486 if (SubVec.isUndef())
5489 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5490 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5493 MVT OpVT = Op.getSimpleValueType();
5494 unsigned NumElems = OpVT.getVectorNumElements();
5496 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5498 // Extend to natively supported kshift.
5499 MVT WideOpVT = OpVT;
5500 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5501 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5503 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5505 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5506 // May need to promote to a legal type.
5507 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5508 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5510 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5513 MVT SubVecVT = SubVec.getSimpleValueType();
5514 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5516 assert(IdxVal + SubVecNumElems <= NumElems &&
5517 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5518 "Unexpected index value in INSERT_SUBVECTOR");
5520 SDValue Undef = DAG.getUNDEF(WideOpVT);
5522 if (IdxVal == 0) {
5523 // Zero lower bits of the Vec
5524 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5525 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5527 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5528 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5529 // Merge them together, SubVec should be zero extended.
5530 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5531 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5533 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5534 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5537 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5538 Undef, SubVec, ZeroIdx);
5540 if (Vec.isUndef()) {
5541 assert(IdxVal != 0 && "Unexpected index");
5542 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5543 DAG.getConstant(IdxVal, dl, MVT::i8));
5544 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5547 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5548 assert(IdxVal != 0 && "Unexpected index");
5549 NumElems = WideOpVT.getVectorNumElements();
5550 unsigned ShiftLeft = NumElems - SubVecNumElems;
5551 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5552 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5553 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5554 if (ShiftRight != 0)
5555 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5556 DAG.getConstant(ShiftRight, dl, MVT::i8));
5557 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5560 // Simple case when we put subvector in the upper part
5561 if (IdxVal + SubVecNumElems == NumElems) {
5562 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5563 DAG.getConstant(IdxVal, dl, MVT::i8));
5564 if (SubVecNumElems * 2 == NumElems) {
5565 // Special case, use legal zero extending insert_subvector. This allows
5566 // isel to optimize when bits are known zero.
5567 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5568 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5569 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5572 // Otherwise use explicit shifts to zero the bits.
5573 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5574 Undef, Vec, ZeroIdx);
5575 NumElems = WideOpVT.getVectorNumElements();
5576 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5577 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5578 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5580 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5581 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5584 // Inserting into the middle is more complicated.
5586 NumElems = WideOpVT.getVectorNumElements();
5588 // Widen the vector if needed.
5589 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5590 // Move the current value of the bits to be replaced to the lsbs.
5591 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5592 DAG.getConstant(IdxVal, dl, MVT::i8));
5593 // Xor with the new bit.
5594 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5595 // Shift to MSB, filling bottom bits with 0.
5596 unsigned ShiftLeft = NumElems - SubVecNumElems;
5597 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5598 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5599 // Shift to the final position, filling upper bits with 0.
5600 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5601 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5602 DAG.getConstant(ShiftRight, dl, MVT::i8));
5603 // Xor with original vector leaving the new value.
5604 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5605 // Reduce to original width if needed.
5606 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
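// For illustration, inserting a v4i1 SubVec into a v16i1 Vec at IdxVal == 4:
// KSHIFTR by 4 brings Vec[4..7] down to the low lanes, the first XOR leaves
// (Vec[4..7] ^ SubVec) there, KSHIFTL by 12 followed by KSHIFTR by 8 isolates
// those four bits back at lanes 4-7 (zeroing everything else), and the final
// XOR with Vec cancels the old bits so lanes 4-7 become SubVec while all other
// lanes keep their original values.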
5609 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
5610 unsigned NumElems, SelectionDAG &DAG,
5611 const SDLoc &dl, unsigned VectorWidth) {
5612 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
5613 return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
5616 /// Returns a vector of specified type with all bits set.
5617 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5618 /// Then bitcast to their original type, ensuring they get CSE'd.
5619 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5620 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5621 "Expected a 128/256/512-bit vector type");
5623 APInt Ones = APInt::getAllOnesValue(32);
5624 unsigned NumElts = VT.getSizeInBits() / 32;
5625 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5626 return DAG.getBitcast(VT, Vec);
5629 static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
5630 SelectionDAG &DAG) {
5631 EVT InVT = In.getValueType();
5632 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
5634 // For 256-bit vectors, we only need the lower (128-bit) input half.
5635 // For 512-bit vectors, we only need the lower input half or quarter.
5636 if (InVT.getSizeInBits() > 128) {
5637 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
5638 "Expected VTs to be the same size!");
5639 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5640 In = extractSubVector(In, 0, DAG, DL,
5641 std::max(128U, VT.getSizeInBits() / Scale));
5642 InVT = In.getValueType();
5645 if (VT.getVectorNumElements() == InVT.getVectorNumElements())
5646 return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
5649 return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG
5650 : ISD::ZERO_EXTEND_VECTOR_INREG,
5654 /// Returns a vector_shuffle node for an unpackl operation.
5655 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5656 SDValue V1, SDValue V2) {
5657 SmallVector<int, 8> Mask;
5658 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5659 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5662 /// Returns a vector_shuffle node for an unpackh operation.
5663 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5664 SDValue V1, SDValue V2) {
5665 SmallVector<int, 8> Mask;
5666 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5667 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
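// For example, for v4i32 getUnpackl produces the mask <0,4,1,5> and getUnpackh
// the mask <2,6,3,7>, interleaving the low or high halves of the two sources.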
5670 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5671 /// This produces a shuffle where the low element of V2 is swizzled into the
5672 /// zero/undef vector, landing at element Idx.
5673 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5674 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5675 bool IsZero,
5676 const X86Subtarget &Subtarget,
5677 SelectionDAG &DAG) {
5678 MVT VT = V2.getSimpleValueType();
5679 SDValue V1 = IsZero
5680 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5681 int NumElems = VT.getVectorNumElements();
5682 SmallVector<int, 16> MaskVec(NumElems);
5683 for (int i = 0; i != NumElems; ++i)
5684 // If this is the insertion idx, put the low elt of V2 here.
5685 MaskVec[i] = (i == Idx) ? NumElems : i;
5686 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5689 // Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
5690 static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
5691 while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
5692 V = V.getOperand(0);
5696 static const Constant *getTargetConstantFromNode(SDValue Op) {
5697 Op = peekThroughBitcasts(Op);
5699 auto *Load = dyn_cast<LoadSDNode>(Op);
5703 SDValue Ptr = Load->getBasePtr();
5704 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5705 Ptr->getOpcode() == X86ISD::WrapperRIP)
5706 Ptr = Ptr->getOperand(0);
5708 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5709 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
5712 return CNode->getConstVal();
5715 // Extract raw constant bits from constant pools.
5716 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5717 APInt &UndefElts,
5718 SmallVectorImpl<APInt> &EltBits,
5719 bool AllowWholeUndefs = true,
5720 bool AllowPartialUndefs = true) {
5721 assert(EltBits.empty() && "Expected an empty EltBits vector");
5723 Op = peekThroughBitcasts(Op);
5725 EVT VT = Op.getValueType();
5726 unsigned SizeInBits = VT.getSizeInBits();
5727 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5728 unsigned NumElts = SizeInBits / EltSizeInBits;
5730 // Bitcast a source array of element bits to the target size.
5731 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5732 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5733 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5734 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5735 "Constant bit sizes don't match");
5737 // Don't split if we don't allow undef bits.
5738 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5739 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5742 // If we're already the right size, don't bother bitcasting.
5743 if (NumSrcElts == NumElts) {
5744 UndefElts = UndefSrcElts;
5745 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5749 // Extract all the undef/constant element data and pack into single bitsets.
5750 APInt UndefBits(SizeInBits, 0);
5751 APInt MaskBits(SizeInBits, 0);
5753 for (unsigned i = 0; i != NumSrcElts; ++i) {
5754 unsigned BitOffset = i * SrcEltSizeInBits;
5755 if (UndefSrcElts[i])
5756 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5757 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5760 // Split the undef/constant single bitset data into the target elements.
5761 UndefElts = APInt(NumElts, 0);
5762 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5764 for (unsigned i = 0; i != NumElts; ++i) {
5765 unsigned BitOffset = i * EltSizeInBits;
5766 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5768 // Only treat an element as UNDEF if all bits are UNDEF.
5769 if (UndefEltBits.isAllOnesValue()) {
5770 if (!AllowWholeUndefs)
5772 UndefElts.setBit(i);
5776 // If only some bits are UNDEF then treat them as zero (or bail if not supported).
5778 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5781 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5782 EltBits[i] = Bits.getZExtValue();
5787 // Collect constant bits and insert into mask/undef bit masks.
5788 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5789 unsigned UndefBitIndex) {
5792 if (isa<UndefValue>(Cst)) {
5793 Undefs.setBit(UndefBitIndex);
5796 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5797 Mask = CInt->getValue();
5800 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5801 Mask = CFP->getValueAPF().bitcastToAPInt();
5807 // Handle UNDEFs.
5808 if (Op.isUndef()) {
5809 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5810 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5811 return CastBitData(UndefSrcElts, SrcEltBits);
5814 // Extract scalar constant bits.
5815 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5816 APInt UndefSrcElts = APInt::getNullValue(1);
5817 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5818 return CastBitData(UndefSrcElts, SrcEltBits);
5820 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5821 APInt UndefSrcElts = APInt::getNullValue(1);
5822 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5823 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5824 return CastBitData(UndefSrcElts, SrcEltBits);
5827 // Extract constant bits from build vector.
5828 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5829 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5830 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5832 APInt UndefSrcElts(NumSrcElts, 0);
5833 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5834 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5835 const SDValue &Src = Op.getOperand(i);
5836 if (Src.isUndef()) {
5837 UndefSrcElts.setBit(i);
5840 auto *Cst = cast<ConstantSDNode>(Src);
5841 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5843 return CastBitData(UndefSrcElts, SrcEltBits);
5845 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
5846 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5847 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5849 APInt UndefSrcElts(NumSrcElts, 0);
5850 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5851 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5852 const SDValue &Src = Op.getOperand(i);
5853 if (Src.isUndef()) {
5854 UndefSrcElts.setBit(i);
5857 auto *Cst = cast<ConstantFPSDNode>(Src);
5858 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5859 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
5861 return CastBitData(UndefSrcElts, SrcEltBits);
5864 // Extract constant bits from constant pool vector.
5865 if (auto *Cst = getTargetConstantFromNode(Op)) {
5866 Type *CstTy = Cst->getType();
5867 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5868 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5871 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5872 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5874 APInt UndefSrcElts(NumSrcElts, 0);
5875 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5876 for (unsigned i = 0; i != NumSrcElts; ++i)
5877 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5881 return CastBitData(UndefSrcElts, SrcEltBits);
5884 // Extract constant bits from a broadcasted constant pool scalar.
5885 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5886 EltSizeInBits <= VT.getScalarSizeInBits()) {
5887 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5888 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5889 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5891 APInt UndefSrcElts(NumSrcElts, 0);
5892 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5893 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5894 if (UndefSrcElts[0])
5895 UndefSrcElts.setBits(0, NumSrcElts);
5896 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5897 return CastBitData(UndefSrcElts, SrcEltBits);
5902 // Extract a rematerialized scalar constant insertion.
5903 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5904 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5905 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5906 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5907 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5909 APInt UndefSrcElts(NumSrcElts, 0);
5910 SmallVector<APInt, 64> SrcEltBits;
5911 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5912 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5913 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5914 return CastBitData(UndefSrcElts, SrcEltBits);
5917 // Extract constant bits from a subvector's source.
5918 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5919 isa<ConstantSDNode>(Op.getOperand(1))) {
5920 // TODO - support extract_subvector through bitcasts.
5921 if (EltSizeInBits != VT.getScalarSizeInBits())
5924 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5925 UndefElts, EltBits, AllowWholeUndefs,
5926 AllowPartialUndefs)) {
5927 EVT SrcVT = Op.getOperand(0).getValueType();
5928 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5929 unsigned NumSubElts = VT.getVectorNumElements();
5930 unsigned BaseIdx = Op.getConstantOperandVal(1);
5931 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5932 if ((BaseIdx + NumSubElts) != NumSrcElts)
5933 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5935 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5940 // Extract constant bits from shuffle node sources.
5941 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5942 // TODO - support shuffle through bitcasts.
5943 if (EltSizeInBits != VT.getScalarSizeInBits())
5946 ArrayRef<int> Mask = SVN->getMask();
5947 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5948 llvm::any_of(Mask, [](int M) { return M < 0; }))
5951 APInt UndefElts0, UndefElts1;
5952 SmallVector<APInt, 32> EltBits0, EltBits1;
5953 if (isAnyInRange(Mask, 0, NumElts) &&
5954 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5955 UndefElts0, EltBits0, AllowWholeUndefs,
5956 AllowPartialUndefs))
5958 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5959 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5960 UndefElts1, EltBits1, AllowWholeUndefs,
5961 AllowPartialUndefs))
5964 UndefElts = APInt::getNullValue(NumElts);
5965 for (int i = 0; i != (int)NumElts; ++i) {
5968 UndefElts.setBit(i);
5969 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
5970 } else if (M < (int)NumElts) {
5972 UndefElts.setBit(i);
5973 EltBits.push_back(EltBits0[M]);
5975 if (UndefElts1[M - NumElts])
5976 UndefElts.setBit(i);
5977 EltBits.push_back(EltBits1[M - NumElts]);
5986 static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
5988 SmallVector<APInt, 16> EltBits;
5989 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
5990 UndefElts, EltBits, true, false)) {
5991 int SplatIndex = -1;
5992 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5995 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6001 if (0 <= SplatIndex) {
6002 SplatVal = EltBits[SplatIndex];
6010 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6011 unsigned MaskEltSizeInBits,
6012 SmallVectorImpl<uint64_t> &RawMask,
6014 // Extract the raw target constant bits.
6015 SmallVector<APInt, 64> EltBits;
6016 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6017 EltBits, /* AllowWholeUndefs */ true,
6018 /* AllowPartialUndefs */ false))
6021 // Insert the extracted elements into the mask.
6022 for (APInt Elt : EltBits)
6023 RawMask.push_back(Elt.getZExtValue());
6028 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6029 /// Note: This ignores saturation, so inputs must be checked first.
6030 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6032 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6033 unsigned NumElts = VT.getVectorNumElements();
6034 unsigned NumLanes = VT.getSizeInBits() / 128;
6035 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6036 unsigned Offset = Unary ? 0 : NumElts;
6038 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6039 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
6040 Mask.push_back(Elt + (Lane * NumEltsPerLane));
6041 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
6042 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6046 // Split the demanded elts of a PACKSS/PACKUS node between its operands.
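// e.g. for a v16i8 PACK, demanded element 0 maps to LHS (v8i16) element 0 and
// demanded element 9 maps to RHS element 1; for 256-bit types the mapping
// repeats per 128-bit lane.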
6047 static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6048 APInt &DemandedLHS, APInt &DemandedRHS) {
6049 int NumLanes = VT.getSizeInBits() / 128;
6050 int NumElts = DemandedElts.getBitWidth();
6051 int NumInnerElts = NumElts / 2;
6052 int NumEltsPerLane = NumElts / NumLanes;
6053 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6055 DemandedLHS = APInt::getNullValue(NumInnerElts);
6056 DemandedRHS = APInt::getNullValue(NumInnerElts);
6058 // Map DemandedElts to the packed operands.
6059 for (int Lane = 0; Lane != NumLanes; ++Lane) {
6060 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6061 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6062 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6063 if (DemandedElts[OuterIdx])
6064 DemandedLHS.setBit(InnerIdx);
6065 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6066 DemandedRHS.setBit(InnerIdx);
6071 /// Calculates the shuffle mask corresponding to the target-specific opcode.
6072 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6073 /// operands in \p Ops, and returns true.
6074 /// Sets \p IsUnary to true if only one source is used. Note that this will set
6075 /// IsUnary for shuffles which use a single input multiple times, and in those
6076 /// cases it will adjust the mask to only have indices within that single input.
6077 /// It is an error to call this with non-empty Mask/Ops vectors.
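/// e.g. an X86ISD::UNPCKL of v4i32 decodes to Mask = <0,4,1,5> with
/// Ops = {Op0, Op1}, and IsUnary is set if both operands are the same node.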
6078 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6079 SmallVectorImpl<SDValue> &Ops,
6080 SmallVectorImpl<int> &Mask, bool &IsUnary) {
6081 unsigned NumElems = VT.getVectorNumElements();
6082 unsigned MaskEltSize = VT.getScalarSizeInBits();
6083 SmallVector<uint64_t, 32> RawMask;
6087 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6088 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6091 bool IsFakeUnary = false;
6092 switch (N->getOpcode()) {
6093 case X86ISD::BLENDI:
6094 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6095 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6096 ImmN = N->getOperand(N->getNumOperands() - 1);
6097 DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6098 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6101 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6102 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6103 ImmN = N->getOperand(N->getNumOperands() - 1);
6104 DecodeSHUFPMask(NumElems, MaskEltSize,
6105 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6106 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6108 case X86ISD::INSERTPS:
6109 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6110 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6111 ImmN = N->getOperand(N->getNumOperands() - 1);
6112 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6113 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6115 case X86ISD::EXTRQI:
6116 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6117 if (isa<ConstantSDNode>(N->getOperand(1)) &&
6118 isa<ConstantSDNode>(N->getOperand(2))) {
6119 int BitLen = N->getConstantOperandVal(1);
6120 int BitIdx = N->getConstantOperandVal(2);
6121 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6125 case X86ISD::INSERTQI:
6126 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6127 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6128 if (isa<ConstantSDNode>(N->getOperand(2)) &&
6129 isa<ConstantSDNode>(N->getOperand(3))) {
6130 int BitLen = N->getConstantOperandVal(2);
6131 int BitIdx = N->getConstantOperandVal(3);
6132 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6133 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6136 case X86ISD::UNPCKH:
6137 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6138 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6139 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
6140 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6142 case X86ISD::UNPCKL:
6143 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6144 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6145 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
6146 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6148 case X86ISD::MOVHLPS:
6149 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6150 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6151 DecodeMOVHLPSMask(NumElems, Mask);
6152 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6154 case X86ISD::MOVLHPS:
6155 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6156 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6157 DecodeMOVLHPSMask(NumElems, Mask);
6158 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6160 case X86ISD::PALIGNR:
6161 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6162 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6163 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6164 ImmN = N->getOperand(N->getNumOperands() - 1);
6165 DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6167 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6168 Ops.push_back(N->getOperand(1));
6169 Ops.push_back(N->getOperand(0));
6171 case X86ISD::VSHLDQ:
6172 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6173 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6174 ImmN = N->getOperand(N->getNumOperands() - 1);
6175 DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6179 case X86ISD::VSRLDQ:
6180 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6181 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6182 ImmN = N->getOperand(N->getNumOperands() - 1);
6183 DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6187 case X86ISD::PSHUFD:
6188 case X86ISD::VPERMILPI:
6189 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6190 ImmN = N->getOperand(N->getNumOperands() - 1);
6191 DecodePSHUFMask(NumElems, MaskEltSize,
6192 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6195 case X86ISD::PSHUFHW:
6196 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6197 ImmN = N->getOperand(N->getNumOperands() - 1);
6198 DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6202 case X86ISD::PSHUFLW:
6203 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6204 ImmN = N->getOperand(N->getNumOperands() - 1);
6205 DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6209 case X86ISD::VZEXT_MOVL:
6210 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6211 DecodeZeroMoveLowMask(NumElems, Mask);
6214 case X86ISD::VBROADCAST: {
6215 SDValue N0 = N->getOperand(0);
6216 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
6217 // add the pre-extracted value to the Ops vector.
6218 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6219 N0.getOperand(0).getValueType() == VT &&
6220 N0.getConstantOperandVal(1) == 0)
6221 Ops.push_back(N0.getOperand(0));
6223 // We only decode broadcasts of same-sized vectors, unless the broadcast
6224 // came from an extract from the original width. If we found one, we
6225 // pushed it to the Ops vector above.
6226 if (N0.getValueType() == VT || !Ops.empty()) {
6227 DecodeVectorBroadcast(NumElems, Mask);
6233 case X86ISD::VPERMILPV: {
6234 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6236 SDValue MaskNode = N->getOperand(1);
6237 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6239 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
6244 case X86ISD::PSHUFB: {
6245 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6246 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6247 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6249 SDValue MaskNode = N->getOperand(1);
6250 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6251 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
6256 case X86ISD::VPERMI:
6257 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6258 ImmN = N->getOperand(N->getNumOperands() - 1);
6259 DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6264 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6265 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6266 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
6268 case X86ISD::VPERM2X128:
6269 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6270 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6271 ImmN = N->getOperand(N->getNumOperands() - 1);
6272 DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6274 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6276 case X86ISD::SHUF128:
6277 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6278 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6279 ImmN = N->getOperand(N->getNumOperands() - 1);
6280 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
6281 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6282 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6284 case X86ISD::MOVSLDUP:
6285 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6286 DecodeMOVSLDUPMask(NumElems, Mask);
6289 case X86ISD::MOVSHDUP:
6290 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6291 DecodeMOVSHDUPMask(NumElems, Mask);
6294 case X86ISD::MOVDDUP:
6295 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6296 DecodeMOVDDUPMask(NumElems, Mask);
6299 case X86ISD::VPERMIL2: {
6300 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6301 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6302 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6303 SDValue MaskNode = N->getOperand(2);
6304 SDValue CtrlNode = N->getOperand(3);
6305 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
6306 unsigned CtrlImm = CtrlOp->getZExtValue();
6307 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6309 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
6316 case X86ISD::VPPERM: {
6317 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6318 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6319 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6320 SDValue MaskNode = N->getOperand(2);
6321 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6322 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
6327 case X86ISD::VPERMV: {
6328 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6330 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6331 Ops.push_back(N->getOperand(1));
6332 SDValue MaskNode = N->getOperand(0);
6333 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6335 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
6340 case X86ISD::VPERMV3: {
6341 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6342 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
6343 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
6344 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6345 Ops.push_back(N->getOperand(0));
6346 Ops.push_back(N->getOperand(2));
6347 SDValue MaskNode = N->getOperand(1);
6348 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6350 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
6355 default: llvm_unreachable("unknown target shuffle node");
6358 // Empty mask indicates the decode failed.
6362 // Check if we're getting a shuffle mask with zero'd elements.
6363 if (!AllowSentinelZero)
6364 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6367 // If we have a fake unary shuffle, the shuffle mask is spread across two
6368 // inputs that are actually the same node. Re-map the mask to always point
6369 // into the first input.
6372 if (M >= (int)Mask.size())
6375 // If we didn't already add operands in the opcode-specific code, default to
6376 // adding 1 or 2 operands starting at 0.
6378 Ops.push_back(N->getOperand(0));
6379 if (!IsUnary || IsFakeUnary)
6380 Ops.push_back(N->getOperand(1));
6386 /// Check a target shuffle mask's inputs to see if we can set any values to
6387 /// SM_SentinelZero - this is for elements that are known to be zero
6388 /// (not just zeroable) from their inputs.
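/// e.g. if one shuffle input is a constant all-zeros vector, every mask index
/// that references it is rewritten to SM_SentinelZero.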
6389 /// Returns true if the target shuffle mask was decoded.
6390 static bool setTargetShuffleZeroElements(SDValue N,
6391 SmallVectorImpl<int> &Mask,
6392 SmallVectorImpl<SDValue> &Ops) {
6394 if (!isTargetShuffle(N.getOpcode()))
6397 MVT VT = N.getSimpleValueType();
6398 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
6401 SDValue V1 = Ops[0];
6402 SDValue V2 = IsUnary ? V1 : Ops[1];
6404 V1 = peekThroughBitcasts(V1);
6405 V2 = peekThroughBitcasts(V2);
6407 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
6408 "Illegal split of shuffle value type");
6409 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
6411 // Extract known constant input data.
6412 APInt UndefSrcElts[2];
6413 SmallVector<APInt, 32> SrcEltBits[2];
6414 bool IsSrcConstant[2] = {
6415 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6416 SrcEltBits[0], true, false),
6417 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6418 SrcEltBits[1], true, false)};
6420 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6423 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6427 // Determine shuffle input and normalize the mask.
6428 unsigned SrcIdx = M / Size;
6429 SDValue V = M < Size ? V1 : V2;
6432 // We are referencing an UNDEF input.
6434 Mask[i] = SM_SentinelUndef;
6438 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6439 // TODO: We currently only set UNDEF for integer types - floats use the same
6440 // registers as vectors and many of the scalar folded loads rely on the
6441 // SCALAR_TO_VECTOR pattern.
6442 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6443 (Size % V.getValueType().getVectorNumElements()) == 0) {
6444 int Scale = Size / V.getValueType().getVectorNumElements();
6445 int Idx = M / Scale;
6446 if (Idx != 0 && !VT.isFloatingPoint())
6447 Mask[i] = SM_SentinelUndef;
6448 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6449 Mask[i] = SM_SentinelZero;
6453 // Attempt to extract from the source's constant bits.
6454 if (IsSrcConstant[SrcIdx]) {
6455 if (UndefSrcElts[SrcIdx][M])
6456 Mask[i] = SM_SentinelUndef;
6457 else if (SrcEltBits[SrcIdx][M] == 0)
6458 Mask[i] = SM_SentinelZero;
6462 assert(VT.getVectorNumElements() == Mask.size() &&
6463 "Different mask size from vector size!");
6467 // Forward declaration (for getFauxShuffleMask recursive check).
6468 static bool resolveTargetShuffleInputs(SDValue Op,
6469 SmallVectorImpl<SDValue> &Inputs,
6470 SmallVectorImpl<int> &Mask,
6471 const SelectionDAG &DAG);
6473 // Attempt to decode ops that could be represented as a shuffle mask.
6474 // The decoded shuffle mask may contain a different number of elements than the
6475 // destination value type.
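// e.g. the bit-shift cases below decode to a byte-granularity mask and the
// SCALAR_TO_VECTOR case decodes at the width of the extracted source vector.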
6476 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
6477 SmallVectorImpl<SDValue> &Ops,
6478 const SelectionDAG &DAG) {
6482 MVT VT = N.getSimpleValueType();
6483 unsigned NumElts = VT.getVectorNumElements();
6484 unsigned NumSizeInBits = VT.getSizeInBits();
6485 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6486 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
6487 "Expected byte aligned value types");
6489 unsigned Opcode = N.getOpcode();
6491 case ISD::VECTOR_SHUFFLE: {
6492 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6493 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6494 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6495 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6496 Ops.push_back(N.getOperand(0));
6497 Ops.push_back(N.getOperand(1));
6503 case X86ISD::ANDNP: {
6504 // Attempt to decode as a per-byte mask.
6506 SmallVector<APInt, 32> EltBits;
6507 SDValue N0 = N.getOperand(0);
6508 SDValue N1 = N.getOperand(1);
6509 bool IsAndN = (X86ISD::ANDNP == Opcode);
6510 uint64_t ZeroMask = IsAndN ? 255 : 0;
6511 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
6513 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6515 Mask.push_back(SM_SentinelUndef);
6518 uint64_t ByteBits = EltBits[i].getZExtValue();
6519 if (ByteBits != 0 && ByteBits != 255)
6521 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6523 Ops.push_back(IsAndN ? N1 : N0);
6527 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6528 // is a valid shuffle index.
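// e.g. OR(PSHUFB(X, C0), PSHUFB(Y, C1)) where the PSHUFB masks zero
// complementary elements can be merged into a single two-input shuffle of X
// and Y.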
6529 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
6530 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
6531 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6533 SmallVector<int, 64> SrcMask0, SrcMask1;
6534 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6535 if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
6536 !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
6538 int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6539 SmallVector<int, 64> Mask0, Mask1;
6540 scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6541 scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6542 for (int i = 0; i != MaskSize; ++i) {
6543 if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
6544 Mask.push_back(SM_SentinelUndef);
6545 else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6546 Mask.push_back(SM_SentinelZero);
6547 else if (Mask1[i] == SM_SentinelZero)
6548 Mask.push_back(Mask0[i]);
6549 else if (Mask0[i] == SM_SentinelZero)
6550 Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size()));
6554 for (SDValue &Op : SrcInputs0)
6556 for (SDValue &Op : SrcInputs1)
6560 case ISD::INSERT_SUBVECTOR: {
6561 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1))) where
6562 // SRC0/SRC1 are both of the same valuetype VT.
6563 // TODO - add peekThroughOneUseBitcasts support.
6564 SDValue Src = N.getOperand(0);
6565 SDValue Sub = N.getOperand(1);
6566 EVT SubVT = Sub.getValueType();
6567 unsigned NumSubElts = SubVT.getVectorNumElements();
6568 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6569 !N->isOnlyUserOf(Sub.getNode()))
6571 SmallVector<int, 64> SubMask;
6572 SmallVector<SDValue, 2> SubInputs;
6573 if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG) ||
6574 SubMask.size() != NumSubElts)
6577 for (SDValue &SubInput : SubInputs) {
6578 if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6579 SubInput.getOperand(0).getValueType() != VT ||
6580 !isa<ConstantSDNode>(SubInput.getOperand(1)))
6582 Ops.push_back(SubInput.getOperand(0));
6584 int InsertIdx = N.getConstantOperandVal(2);
6585 for (int i = 0; i != (int)NumElts; ++i)
6587 for (int i = 0; i != (int)NumSubElts; ++i) {
6590 int InputIdx = M / NumSubElts;
6591 int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1);
6592 M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts);
6594 Mask[i + InsertIdx] = M;
6598 case ISD::SCALAR_TO_VECTOR: {
6599 // Match against a scalar_to_vector of an extract from a vector,
6600 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
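// e.g. (v4i32 scalar_to_vector (X86ISD::PEXTRB v16i8 X, 3)) decodes to the
// byte-granularity mask <3,Z,Z,Z,U,U,...> since PEXTRB implicitly
// zero-extends the extracted byte to 32 bits.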
6601 SDValue N0 = N.getOperand(0);
6604 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6605 N0.getOperand(0).getValueType() == VT) ||
6606 (N0.getOpcode() == X86ISD::PEXTRW &&
6607 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6608 (N0.getOpcode() == X86ISD::PEXTRB &&
6609 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6613 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6616 SDValue SrcVec = SrcExtract.getOperand(0);
6617 EVT SrcVT = SrcVec.getValueType();
6618 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6619 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6621 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6622 if (NumSrcElts <= SrcIdx)
6625 Ops.push_back(SrcVec);
6626 Mask.push_back(SrcIdx);
6627 Mask.append(NumZeros, SM_SentinelZero);
6628 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6631 case X86ISD::PINSRB:
6632 case X86ISD::PINSRW: {
6633 SDValue InVec = N.getOperand(0);
6634 SDValue InScl = N.getOperand(1);
6635 SDValue InIndex = N.getOperand(2);
6636 if (!isa<ConstantSDNode>(InIndex) ||
6637 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
6639 uint64_t InIdx = N.getConstantOperandVal(2);
6641 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6642 if (X86::isZeroNode(InScl)) {
6643 Ops.push_back(InVec);
6644 for (unsigned i = 0; i != NumElts; ++i)
6645 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6649 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6650 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6652 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6653 if (InScl.getOpcode() != ExOp)
6656 SDValue ExVec = InScl.getOperand(0);
6657 SDValue ExIndex = InScl.getOperand(1);
6658 if (!isa<ConstantSDNode>(ExIndex) ||
6659 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
6661 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6663 Ops.push_back(InVec);
6664 Ops.push_back(ExVec);
6665 for (unsigned i = 0; i != NumElts; ++i)
6666 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6669 case X86ISD::PACKSS:
6670 case X86ISD::PACKUS: {
6671 SDValue N0 = N.getOperand(0);
6672 SDValue N1 = N.getOperand(1);
6673 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6674 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6675 "Unexpected input value type");
6677 // If we know input saturation won't happen we can treat this
6678 // as a truncation shuffle.
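// i.e. PACKSS is a plain truncation only if every source element already fits
// in NumBitsPerElt signed bits (has more than NumBitsPerElt sign bits), and
// PACKUS only if the upper NumBitsPerElt bits are already zero.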
6679 if (Opcode == X86ISD::PACKSS) {
6680 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6681 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6684 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6685 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6686 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6690 bool IsUnary = (N0 == N1);
6696 createPackShuffleMask(VT, Mask, IsUnary);
6700 case X86ISD::VSRLI: {
6701 uint64_t ShiftVal = N.getConstantOperandVal(1);
6702 // Out of range bit shifts are guaranteed to be zero.
6703 if (NumBitsPerElt <= ShiftVal) {
6704 Mask.append(NumElts, SM_SentinelZero);
6708 // We can only decode 'whole byte' bit shifts as shuffles.
6709 if ((ShiftVal % 8) != 0)
6712 uint64_t ByteShift = ShiftVal / 8;
6713 unsigned NumBytes = NumSizeInBits / 8;
6714 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6715 Ops.push_back(N.getOperand(0));
6717 // Clear mask to all zeros and insert the shifted byte indices.
6718 Mask.append(NumBytes, SM_SentinelZero);
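// e.g. (X86ISD::VSHLI v2i64 X, 16) decodes to the byte mask
// <Z,Z,0,1,2,3,4,5,Z,Z,8,9,10,11,12,13>.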
6720 if (X86ISD::VSHLI == Opcode) {
6721 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6722 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6723 Mask[i + j] = i + j - ByteShift;
6725 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6726 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6727 Mask[i + j - ByteShift] = i + j;
6731 case ISD::ZERO_EXTEND_VECTOR_INREG:
6732 case ISD::ZERO_EXTEND: {
6733 // TODO - add support for VPMOVZX with smaller input vector types.
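// e.g. (v4i32 zero_extend_vector_inreg (v8i16 X)) decodes to the
// v8i16-granularity mask <0,Z,1,Z,2,Z,3,Z>.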
6734 SDValue Src = N.getOperand(0);
6735 MVT SrcVT = Src.getSimpleValueType();
6736 if (NumSizeInBits != SrcVT.getSizeInBits())
6738 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6748 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6749 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6750 SmallVectorImpl<int> &Mask) {
6751 int MaskWidth = Mask.size();
6752 SmallVector<SDValue, 16> UsedInputs;
6753 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6754 int lo = UsedInputs.size() * MaskWidth;
6755 int hi = lo + MaskWidth;
6757 // Strip UNDEF input usage.
6758 if (Inputs[i].isUndef())
6760 if ((lo <= M) && (M < hi))
6761 M = SM_SentinelUndef;
6763 // Check for unused inputs.
6764 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6765 UsedInputs.push_back(Inputs[i]);
6772 Inputs = UsedInputs;
6775 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6776 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6777 /// remaining input indices in case we now have a unary shuffle and adjust the
6778 /// inputs accordingly.
6779 /// Returns true if the target shuffle mask was decoded.
6780 static bool resolveTargetShuffleInputs(SDValue Op,
6781 SmallVectorImpl<SDValue> &Inputs,
6782 SmallVectorImpl<int> &Mask,
6783 const SelectionDAG &DAG) {
6784 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6785 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6788 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6792 /// Returns the scalar element that will make up the ith
6793 /// element of the result of the vector shuffle.
6794 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6797 return SDValue(); // Limit search depth.
6799 SDValue V = SDValue(N, 0);
6800 EVT VT = V.getValueType();
6801 unsigned Opcode = V.getOpcode();
6803 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6804 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6805 int Elt = SV->getMaskElt(Index);
6808 return DAG.getUNDEF(VT.getVectorElementType());
6810 unsigned NumElems = VT.getVectorNumElements();
6811 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6812 : SV->getOperand(1);
6813 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6816 // Recurse into target specific vector shuffles to find scalars.
6817 if (isTargetShuffle(Opcode)) {
6818 MVT ShufVT = V.getSimpleValueType();
6819 MVT ShufSVT = ShufVT.getVectorElementType();
6820 int NumElems = (int)ShufVT.getVectorNumElements();
6821 SmallVector<int, 16> ShuffleMask;
6822 SmallVector<SDValue, 16> ShuffleOps;
6825 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6828 int Elt = ShuffleMask[Index];
6829 if (Elt == SM_SentinelZero)
6830 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6831 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6832 if (Elt == SM_SentinelUndef)
6833 return DAG.getUNDEF(ShufSVT);
6835 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6836 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6837 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6841 // Actual nodes that may contain scalar elements
6842 if (Opcode == ISD::BITCAST) {
6843 V = V.getOperand(0);
6844 EVT SrcVT = V.getValueType();
6845 unsigned NumElems = VT.getVectorNumElements();
6847 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6851 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6852 return (Index == 0) ? V.getOperand(0)
6853 : DAG.getUNDEF(VT.getVectorElementType());
6855 if (V.getOpcode() == ISD::BUILD_VECTOR)
6856 return V.getOperand(Index);
6861 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6862 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6863 unsigned NumNonZero, unsigned NumZero,
6865 const X86Subtarget &Subtarget) {
6866 MVT VT = Op.getSimpleValueType();
6867 unsigned NumElts = VT.getVectorNumElements();
6868 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6869 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6870 "Illegal vector insertion");
6876 for (unsigned i = 0; i < NumElts; ++i) {
6877 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6881 // If the build vector contains zeros or our first insertion is not the
6882 // first index then insert into zero vector to break any register
6883 // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6886 if (NumZero || 0 != i)
6887 V = getZeroVector(VT, Subtarget, DAG, dl);
6889 assert(0 == i && "Expected insertion into zero-index");
6890 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6891 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6892 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6893 V = DAG.getBitcast(VT, V);
6897 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6898 DAG.getIntPtrConstant(i, dl));
6904 /// Custom lower build_vector of v16i8.
6905 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6906 unsigned NumNonZero, unsigned NumZero,
6908 const X86Subtarget &Subtarget) {
6909 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6912 // SSE4.1 - use PINSRB to insert each byte directly.
6913 if (Subtarget.hasSSE41())
6914 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6921 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6922 for (unsigned i = 0; i < 16; ++i) {
6923 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6924 if (ThisIsNonZero && First) {
6926 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6928 V = DAG.getUNDEF(MVT::v8i16);
6933 // FIXME: Investigate extending to i32 instead of just i16.
6934 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6935 SDValue ThisElt, LastElt;
6936 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6937 if (LastIsNonZero) {
6939 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6941 if (ThisIsNonZero) {
6942 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6943 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6944 DAG.getConstant(8, dl, MVT::i8));
6946 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6952 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6953 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6954 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6955 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6956 V = DAG.getBitcast(MVT::v8i16, V);
6958 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6959 DAG.getIntPtrConstant(i / 2, dl));
6965 return DAG.getBitcast(MVT::v16i8, V);
6968 /// Custom lower build_vector of v8i16.
6969 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6970 unsigned NumNonZero, unsigned NumZero,
6972 const X86Subtarget &Subtarget) {
6973 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6976 // Use PINSRW to insert each element directly.
6977 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6981 /// Custom lower build_vector of v4i32 or v4f32.
6982 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6983 const X86Subtarget &Subtarget) {
6984 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6985 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6986 // Because we're creating a less complicated build vector here, we may enable
6987 // further folding of the MOVDDUP via shuffle transforms.
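// e.g. (build_vector a, b, a, b) becomes a MOVDDUP of the v2f64 bitcast of
// (build_vector a, b, undef, undef).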
6988 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6989 Op.getOperand(0) == Op.getOperand(2) &&
6990 Op.getOperand(1) == Op.getOperand(3) &&
6991 Op.getOperand(0) != Op.getOperand(1)) {
6993 MVT VT = Op.getSimpleValueType();
6994 MVT EltVT = VT.getVectorElementType();
6995 // Create a new build vector with the first 2 elements followed by undef
6996 // padding, bitcast to v2f64, duplicate, and bitcast back.
6997 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6998 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6999 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7000 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7001 return DAG.getBitcast(VT, Dup);
7004 // Find all zeroable elements.
7005 std::bitset<4> Zeroable;
7006 for (int i = 0; i < 4; ++i) {
7007 SDValue Elt = Op->getOperand(i);
7008 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7010 assert(Zeroable.size() - Zeroable.count() > 1 &&
7011 "We expect at least two non-zero elements!");
7013 // We only know how to deal with build_vector nodes where elements are either
7014 // zeroable or extract_vector_elt with constant index.
7015 SDValue FirstNonZero;
7016 unsigned FirstNonZeroIdx;
7017 for (unsigned i = 0; i < 4; ++i) {
7020 SDValue Elt = Op->getOperand(i);
7021 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7022 !isa<ConstantSDNode>(Elt.getOperand(1)))
7024 // Make sure that this node is extracting from a 128-bit vector.
7025 MVT VT = Elt.getOperand(0).getSimpleValueType();
7026 if (!VT.is128BitVector())
7028 if (!FirstNonZero.getNode()) {
7030 FirstNonZeroIdx = i;
7034 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7035 SDValue V1 = FirstNonZero.getOperand(0);
7036 MVT VT = V1.getSimpleValueType();
7038 // See if this build_vector can be lowered as a blend with zero.
7040 unsigned EltMaskIdx, EltIdx;
7042 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7043 if (Zeroable[EltIdx]) {
7044 // The zero vector will be on the right hand side.
7045 Mask[EltIdx] = EltIdx+4;
7049 Elt = Op->getOperand(EltIdx);
7050 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7051 EltMaskIdx = Elt.getConstantOperandVal(1);
7052 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7054 Mask[EltIdx] = EltIdx;
7058 // Let the shuffle legalizer deal with blend operations.
7059 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
7060 if (V1.getSimpleValueType() != VT)
7061 V1 = DAG.getBitcast(VT, V1);
7062 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
7065 // See if we can lower this build_vector to a INSERTPS.
7066 if (!Subtarget.hasSSE41())
7069 SDValue V2 = Elt.getOperand(0);
7070 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7073 bool CanFold = true;
7074 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7078 SDValue Current = Op->getOperand(i);
7079 SDValue SrcVector = Current->getOperand(0);
7082 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
7088 assert(V1.getNode() && "Expected at least two non-zero elements!");
7089 if (V1.getSimpleValueType() != MVT::v4f32)
7090 V1 = DAG.getBitcast(MVT::v4f32, V1);
7091 if (V2.getSimpleValueType() != MVT::v4f32)
7092 V2 = DAG.getBitcast(MVT::v4f32, V2);
7094 // Ok, we can emit an INSERTPS instruction.
7095 unsigned ZMask = Zeroable.to_ulong();
7097 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7098 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7100 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7101 DAG.getIntPtrConstant(InsertPSMask, DL));
7102 return DAG.getBitcast(VT, Result);
7105 /// Return a vector logical shift node.
7106 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7107 SelectionDAG &DAG, const TargetLowering &TLI,
7109 assert(VT.is128BitVector() && "Unknown type for VShift");
7110 MVT ShVT = MVT::v16i8;
7111 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7112 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7113 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7114 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
7115 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7118 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7119 SelectionDAG &DAG) {
7121 // Check if the scalar load can be widened into a vector load. And if
7122 // the address is "base + cst" see if the cst can be "absorbed" into
7123 // the shuffle mask.
7124 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7125 SDValue Ptr = LD->getBasePtr();
7126 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
7128 EVT PVT = LD->getValueType(0);
7129 if (PVT != MVT::i32 && PVT != MVT::f32)
7134 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7135 FI = FINode->getIndex();
7137 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7138 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7139 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7140 Offset = Ptr.getConstantOperandVal(1);
7141 Ptr = Ptr.getOperand(0);
7146 // FIXME: 256-bit vector instructions don't require a strict alignment,
7147 // improve this code to support it better.
7148 unsigned RequiredAlign = VT.getSizeInBits()/8;
7149 SDValue Chain = LD->getChain();
7150 // Make sure the stack object alignment is at least 16 or 32.
7151 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7152 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
7153 if (MFI.isFixedObjectIndex(FI)) {
7154 // Can't change the alignment. FIXME: It's possible to compute
7155 // the exact stack offset and reference FI + adjust offset instead.
7156 // If someone *really* cares about this, that's the way to implement it.
7159 MFI.setObjectAlignment(FI, RequiredAlign);
7163 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7164 // Ptr + (Offset & ~15).
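// e.g. an f32 load at offset 20 from a 16-byte-aligned slot becomes a v4f32
// load at offset 16 followed by a splat of element 1.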
7167 if ((Offset % RequiredAlign) & 3)
7169 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
7172 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7173 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7176 int EltNo = (Offset - StartOffset) >> 2;
7177 unsigned NumElems = VT.getVectorNumElements();
7179 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7180 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7181 LD->getPointerInfo().getWithOffset(StartOffset));
7183 SmallVector<int, 8> Mask(NumElems, EltNo);
7185 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7191 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7192 /// elements can be replaced by a single large load which has the same value as
7193 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7195 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7196 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7197 const SDLoc &DL, SelectionDAG &DAG,
7198 const X86Subtarget &Subtarget,
7199 bool isAfterLegalize) {
7200 unsigned NumElems = Elts.size();
7202 int LastLoadedElt = -1;
7203 SmallBitVector LoadMask(NumElems, false);
7204 SmallBitVector ZeroMask(NumElems, false);
7205 SmallBitVector UndefMask(NumElems, false);
7207 // For each element in the initializer, see if we've found a load, zero or an
7209 for (unsigned i = 0; i < NumElems; ++i) {
7210 SDValue Elt = peekThroughBitcasts(Elts[i]);
7215 UndefMask[i] = true;
7216 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
7218 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
7221 // Each loaded element must be the correct fractional portion of the
7222 // requested vector load.
7223 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
7228 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
7229 "Incomplete element masks");
7231 // Handle Special Cases - all undef or undef/zero.
7232 if (UndefMask.count() == NumElems)
7233 return DAG.getUNDEF(VT);
7235 // FIXME: Should we return this as a BUILD_VECTOR instead?
7236 if ((ZeroMask | UndefMask).count() == NumElems)
7237 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7238 : DAG.getConstantFP(0.0, DL, VT);
7240 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7241 int FirstLoadedElt = LoadMask.find_first();
7242 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7243 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
7244 EVT LDBaseVT = EltBase.getValueType();
7246 // Consecutive loads can contain UNDEFS but not ZERO elements.
7247 // Consecutive loads with UNDEF and ZERO elements require an
7248 // additional shuffle stage to clear the ZERO elements.
7249 bool IsConsecutiveLoad = true;
7250 bool IsConsecutiveLoadWithZeros = true;
7251 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7253 SDValue Elt = peekThroughBitcasts(Elts[i]);
7254 LoadSDNode *LD = cast<LoadSDNode>(Elt);
7255 if (!DAG.areNonVolatileConsecutiveLoads(
7256 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
7257 i - FirstLoadedElt)) {
7258 IsConsecutiveLoad = false;
7259 IsConsecutiveLoadWithZeros = false;
7262 } else if (ZeroMask[i]) {
7263 IsConsecutiveLoad = false;
7267 SmallVector<LoadSDNode *, 8> Loads;
7268 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
7270 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
7272 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7273 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7274 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
7275 "Cannot merge volatile loads.");
7277 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7278 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
7279 for (auto *LD : Loads)
7280 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7284 // LOAD - all consecutive load/undefs (must start/end with a load).
7285 // If we have found an entire vector of loads and undefs, then return a large
7286 // load of the entire vector width starting at the base pointer.
7287 // If the vector contains zeros, then attempt to shuffle those elements.
7288 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
7289 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7290 assert(LDBase && "Did not find base load for merging consecutive loads");
7291 EVT EltVT = LDBase->getValueType(0);
7292 // Ensure that the input vector size for the merged loads matches the
7293 // cumulative size of the input elements.
7294 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
7297 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7300 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7301 // will lower to regular temporal loads and use the cache.
7302 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
7303 VT.is256BitVector() && !Subtarget.hasInt256())
7306 if (IsConsecutiveLoad)
7307 return CreateLoad(VT, LDBase);
7309 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7310 // vector and a zero vector to clear out the zero elements.
7311 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
7312 SmallVector<int, 4> ClearMask(NumElems, -1);
7313 for (unsigned i = 0; i < NumElems; ++i) {
7315 ClearMask[i] = i + NumElems;
7316 else if (LoadMask[i])
7319 SDValue V = CreateLoad(VT, LDBase);
7320 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7321 : DAG.getConstantFP(0.0, DL, VT);
7322 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7327 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
7329 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7330 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7331 (LoadSize == 32 || LoadSize == 64) &&
7332 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7333 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
7334 : MVT::getIntegerVT(LoadSize);
7335 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
7336 if (TLI.isTypeLegal(VecVT)) {
7337 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7338 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7340 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
7341 LDBase->getPointerInfo(),
7342 LDBase->getAlignment(),
7343 MachineMemOperand::MOLoad);
7344 for (auto *LD : Loads)
7345 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7346 return DAG.getBitcast(VT, ResNode);
7353 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7354 unsigned SplatBitSize, LLVMContext &C) {
7355 unsigned ScalarSize = VT.getScalarSizeInBits();
7356 unsigned NumElm = SplatBitSize / ScalarSize;
7358 SmallVector<Constant *, 32> ConstantVec;
7359 for (unsigned i = 0; i < NumElm; i++) {
7360 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
7362 if (VT.isFloatingPoint()) {
7363 if (ScalarSize == 32) {
7364 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7366 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7367 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7370 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7371 ConstantVec.push_back(Const);
7373 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7376 static bool isUseOfShuffle(SDNode *N) {
7377 for (auto *U : N->uses()) {
7378 if (isTargetShuffle(U->getOpcode()))
7380 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
7381 return isUseOfShuffle(U);
7386 // Check if the current node of build vector is a zero extended vector.
7387 // If so, return the value extended.
7388 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
7389 // NumElt - return the number of zero extended identical values.
7390 // EltType - return the type of the value including the zero extend.
7391 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
7392 unsigned &NumElt, MVT &EltType) {
7393 SDValue ExtValue = Op->getOperand(0);
7394 unsigned NumElts = Op->getNumOperands();
7395 unsigned Delta = NumElts;
7397 for (unsigned i = 1; i < NumElts; i++) {
7398 if (Op->getOperand(i) == ExtValue) {
7402 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
7405 if (!isPowerOf2_32(Delta) || Delta == 1)
7408 for (unsigned i = Delta; i < NumElts; i++) {
7409 if (i % Delta == 0) {
7410 if (Op->getOperand(i) != ExtValue)
7412 } else if (!(isNullConstant(Op->getOperand(i)) ||
7413 Op->getOperand(i).isUndef()))
7416 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
7417 unsigned ExtVTSize = EltSize * Delta;
7418 EltType = MVT::getIntegerVT(ExtVTSize);
7419 NumElt = NumElts / Delta;
7423 /// Attempt to use the vbroadcast instruction to generate a splat value
7424 /// from a splat BUILD_VECTOR which uses:
7425 /// a. A single scalar load, or a constant.
7426 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7428 /// The VBROADCAST node is returned when a pattern is found,
7429 /// or SDValue() otherwise.
7430 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7431 const X86Subtarget &Subtarget,
7432 SelectionDAG &DAG) {
7433 // VBROADCAST requires AVX.
7434 // TODO: Splats could be generated for non-AVX CPUs using SSE
7435 // instructions, but there's less potential gain for only 128-bit vectors.
7436 if (!Subtarget.hasAVX())
7439 MVT VT = BVOp->getSimpleValueType(0);
7442 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7443 "Unsupported vector type for broadcast.");
7445 BitVector UndefElements;
7446 SDValue Ld = BVOp->getSplatValue(&UndefElements);
7448 // Attempt to use VBROADCASTM
7449 // From this pattern:
7450 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7451 // b. t1 = (build_vector t0 t0)
7453 // Create (VBROADCASTM v2i1 X)
7454 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
7455 MVT EltType = VT.getScalarType();
7456 unsigned NumElts = VT.getVectorNumElements();
7458 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
7459 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
7460 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
7461 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
7463 BOperand = ZeroExtended.getOperand(0);
7465 BOperand = Ld.getOperand(0).getOperand(0);
7466 MVT MaskVT = BOperand.getSimpleValueType();
7467 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7468 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7470 DAG.getNode(X86ISD::VBROADCASTM, dl,
7471 MVT::getVectorVT(EltType, NumElts), BOperand);
7472 return DAG.getBitcast(VT, Brdcst);
7477 unsigned NumElts = VT.getVectorNumElements();
7478 unsigned NumUndefElts = UndefElements.count();
7479 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7480 APInt SplatValue, Undef;
7481 unsigned SplatBitSize;
7483 // Check if this is a repeated constant pattern suitable for broadcasting.
7484 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7485 SplatBitSize > VT.getScalarSizeInBits() &&
7486 SplatBitSize < VT.getSizeInBits()) {
7487 // Avoid replacing with broadcast when it's a use of a shuffle
7488 // instruction to preserve the present custom lowering of shuffles.
7489 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
7491 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
7492 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7493 LLVMContext *Ctx = DAG.getContext();
7494 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7495 if (Subtarget.hasAVX()) {
7496 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
7497 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
7498 // Splatted value can fit in one INTEGER constant in constant pool.
7499 // Load the constant and broadcast it.
7500 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7501 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
7502 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
7503 SDValue CP = DAG.getConstantPool(C, PVT);
7504 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7506 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7508 CVT, dl, DAG.getEntryNode(), CP,
7509 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7511 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7512 MVT::getVectorVT(CVT, Repeat), Ld);
7513 return DAG.getBitcast(VT, Brdcst);
7514 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
7515 // Splatted value can fit in one FLOAT constant in constant pool.
7516 // Load the constant and broadcast it.
7517 // AVX has support for 32 and 64 bit broadcasts for floats only.
7518 // There is no 64-bit integer broadcast on a 32-bit subtarget.
7519 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
7520 // Lower the splat via APFloat directly, to avoid any conversion.
7523 ? ConstantFP::get(*Ctx,
7524 APFloat(APFloat::IEEEsingle(), SplatValue))
7525 : ConstantFP::get(*Ctx,
7526 APFloat(APFloat::IEEEdouble(), SplatValue));
7527 SDValue CP = DAG.getConstantPool(C, PVT);
7528 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7530 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7532 CVT, dl, DAG.getEntryNode(), CP,
7533 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7535 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7536 MVT::getVectorVT(CVT, Repeat), Ld);
7537 return DAG.getBitcast(VT, Brdcst);
7538 } else if (SplatBitSize > 64) {
7539 // Load the vector of constants and broadcast it.
7540 MVT CVT = VT.getScalarType();
7541 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
7543 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7544 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7545 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
7547 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
7548 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7550 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
7551 return DAG.getBitcast(VT, Brdcst);
7556 // If we are moving a scalar into a vector (Ld must be set and all elements
7557 // but 1 are undef) and that operation is not obviously supported by
7558 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7559 // That's better than general shuffling and may eliminate a load to GPR and
7560 // move from scalar to vector register.
7561 if (!Ld || NumElts - NumUndefElts != 1)
7563 unsigned ScalarSize = Ld.getValueSizeInBits();
7564 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7568 bool ConstSplatVal =
7569 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7571 // Make sure that all of the users of a non-constant load are from the
7572 // BUILD_VECTOR node.
7573 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
7576 unsigned ScalarSize = Ld.getValueSizeInBits();
7577 bool IsGE256 = (VT.getSizeInBits() >= 256);
7579 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7580 // instruction to save 8 or more bytes of constant pool data.
7581 // TODO: If multiple splats are generated to load the same constant,
7582 // it may be detrimental to overall size. There needs to be a way to detect
7583 // that condition to know if this is truly a size win.
7584 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
7586 // Handle broadcasting a single constant scalar from the constant pool
7588 // On Sandybridge (no AVX2), it is still better to load a constant vector
7589 // from the constant pool and not to broadcast it from a scalar.
7590 // But override that restriction when optimizing for size.
7591 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7592 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7593 EVT CVT = Ld.getValueType();
7594 assert(!CVT.isVector() && "Must not broadcast a vector type");
7596 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
7597 // For size optimization, also splat v2f64 and v2i64, and for size opt
7598 // with AVX2, also splat i8 and i16.
7599 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7600 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7601 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7602 const Constant *C = nullptr;
7603 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7604 C = CI->getConstantIntValue();
7605 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7606 C = CF->getConstantFPValue();
7608 assert(C && "Invalid constant type");
7610 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7612 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7613 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7615 CVT, dl, DAG.getEntryNode(), CP,
7616 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7619 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7623 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7625 // Handle AVX2 in-register broadcasts.
7626 if (!IsLoad && Subtarget.hasInt256() &&
7627 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7628 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7630 // The scalar source must be a normal load.
7634 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7635 (Subtarget.hasVLX() && ScalarSize == 64))
7636 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7638 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7639 // match double, since there is no vbroadcastsd xmm instruction.
7640 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7641 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7642 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7645 // Unsupported broadcast.
7649 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
7650 /// underlying vector and index.
7652 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7654 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7656 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7657 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7660 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7662 // (extract_vector_elt (v8f32 %1), Constant<6>)
7664 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7665 // (extract_subvector (v8f32 %0), Constant<4>),
7668 // In this case the vector is the extract_subvector expression and the index
7669 // is 2, as specified by the shuffle.
7670 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7671 SDValue ShuffleVec = SVOp->getOperand(0);
7672 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7673 assert(ShuffleVecVT.getVectorElementType() ==
7674 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7676 int ShuffleIdx = SVOp->getMaskElt(Idx);
7677 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7678 ExtractedFromVec = ShuffleVec;
7684 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7685 MVT VT = Op.getSimpleValueType();
7687 // Skip if insert_vec_elt is not supported.
7688 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7689 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7693 unsigned NumElems = Op.getNumOperands();
7697 SmallVector<unsigned, 4> InsertIndices;
7698 SmallVector<int, 8> Mask(NumElems, -1);
7700 for (unsigned i = 0; i != NumElems; ++i) {
7701 unsigned Opc = Op.getOperand(i).getOpcode();
7703 if (Opc == ISD::UNDEF)
7706 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7707 // Quit if more than 1 element needs inserting.
7708 if (InsertIndices.size() > 1)
7711 InsertIndices.push_back(i);
7715 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7716 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7718 // Quit if non-constant index.
7719 if (!isa<ConstantSDNode>(ExtIdx))
7721 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7723 // Quit if extracted from vector of different type.
7724 if (ExtractedFromVec.getValueType() != VT)
7727 if (!VecIn1.getNode())
7728 VecIn1 = ExtractedFromVec;
7729 else if (VecIn1 != ExtractedFromVec) {
7730 if (!VecIn2.getNode())
7731 VecIn2 = ExtractedFromVec;
7732 else if (VecIn2 != ExtractedFromVec)
7733 // Quit if more than 2 vectors to shuffle
7737 if (ExtractedFromVec == VecIn1)
7739 else if (ExtractedFromVec == VecIn2)
7740 Mask[i] = Idx + NumElems;
7743 if (!VecIn1.getNode())
7746 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7747 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7749 for (unsigned Idx : InsertIndices)
7750 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7751 DAG.getIntPtrConstant(Idx, DL));
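// As an example of the conversion below: a constant (v4i1 1, 0, 1, 1) packs
// into the immediate 0b1101 (bit idx holds element idx), and the result type
// is rounded up to at least i8, i.e. DAG.getConstant(13, dl, MVT::i8).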
7756 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7757 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7758 Op.getScalarValueSizeInBits() == 1 &&
7759 "Can not convert non-constant vector");
7760 uint64_t Immediate = 0;
7761 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7762 SDValue In = Op.getOperand(idx);
7764 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7767 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7768 return DAG.getConstant(Immediate, dl, VT);
7770 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7771 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7772 const X86Subtarget &Subtarget) {
7774 MVT VT = Op.getSimpleValueType();
7775 assert((VT.getVectorElementType() == MVT::i1) &&
7776 "Unexpected type in LowerBUILD_VECTORvXi1!");
7779 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7782 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7785 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7786 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7787 // Split the pieces.
7789 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7791 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7792 // We have to manually lower both halves so getNode doesn't try to
7793 // reassemble the build_vector.
7794 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7795 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7796 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7798 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7799 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7800 return DAG.getBitcast(VT, Imm);
7801 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7802 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7803 DAG.getIntPtrConstant(0, dl));
7806 // Vector has one or more non-const elements
7807 uint64_t Immediate = 0;
7808 SmallVector<unsigned, 16> NonConstIdx;
7809 bool IsSplat = true;
7810 bool HasConstElts = false;
7812 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7813 SDValue In = Op.getOperand(idx);
7816 if (!isa<ConstantSDNode>(In))
7817 NonConstIdx.push_back(idx);
7819 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7820 HasConstElts = true;
7824 else if (In != Op.getOperand(SplatIdx))
7828 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
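// E.g. splatting a single non-constant i1 %b into v16i1 becomes roughly
//   (select %b, (v16i1 all-ones), (v16i1 all-zeros))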
7830 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7831 DAG.getConstant(1, dl, VT),
7832 DAG.getConstant(0, dl, VT));
7834 // Insert elements one by one.
7838 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7839 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7841 else if (HasConstElts)
7842 Imm = DAG.getConstant(0, dl, VT);
7844 Imm = DAG.getUNDEF(VT);
7845 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7846 DstVec = DAG.getBitcast(VT, Imm);
7848 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7849 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7850 DAG.getIntPtrConstant(0, dl));
7853 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7854 unsigned InsertIdx = NonConstIdx[i];
7855 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7856 Op.getOperand(InsertIdx),
7857 DAG.getIntPtrConstant(InsertIdx, dl));
7862 /// This is a helper function of LowerToHorizontalOp().
7863 /// This function checks that the input build_vector \p N implements a
7864 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7865 /// may not match the layout of an x86 256-bit horizontal instruction.
7866 /// In other words, if this returns true, then some extraction/insertion will
7867 /// be required to produce a valid horizontal instruction.
7869 /// Parameter \p Opcode defines the kind of horizontal operation to match.
7870 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7871 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7872 /// is equal to ISD::SUB, then this function checks if this is a horizontal arithmetic sub.
7875 /// This function only analyzes elements of \p N whose indices are
7876 /// in range [BaseIdx, LastIdx).
7878 /// TODO: This function was originally used to match both real and fake partial
7879 /// horizontal operations, but the index-matching logic is incorrect for that.
7880 /// See the corrected implementation in isHopBuildVector(). Can we reduce this
7881 /// code because it is only used for partial h-op matching now?
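/// As an example, with \p Opcode == ISD::ADD, \p BaseIdx == 0 and
/// \p LastIdx == 4, a v8i32 build_vector whose first four operands are roughly
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
///   (add (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
///   (add (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// is reported as a partial horizontal add with \p V0 == A and \p V1 == B.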
7882 static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7884 unsigned BaseIdx, unsigned LastIdx,
7885 SDValue &V0, SDValue &V1) {
7886 EVT VT = N->getValueType(0);
7887 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7888 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7889 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7890 "Invalid Vector in input!");
7892 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7893 bool CanFold = true;
7894 unsigned ExpectedVExtractIdx = BaseIdx;
7895 unsigned NumElts = LastIdx - BaseIdx;
7896 V0 = DAG.getUNDEF(VT);
7897 V1 = DAG.getUNDEF(VT);
7899 // Check if N implements a horizontal binop.
7900 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7901 SDValue Op = N->getOperand(i + BaseIdx);
7904 if (Op->isUndef()) {
7905 // Update the expected vector extract index.
7906 if (i * 2 == NumElts)
7907 ExpectedVExtractIdx = BaseIdx;
7908 ExpectedVExtractIdx += 2;
7912 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7917 SDValue Op0 = Op.getOperand(0);
7918 SDValue Op1 = Op.getOperand(1);
7920 // Try to match the following pattern:
7921 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7922 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7923 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7924 Op0.getOperand(0) == Op1.getOperand(0) &&
7925 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7926 isa<ConstantSDNode>(Op1.getOperand(1)));
7930 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7931 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7933 if (i * 2 < NumElts) {
7935 V0 = Op0.getOperand(0);
7936 if (V0.getValueType() != VT)
7941 V1 = Op0.getOperand(0);
7942 if (V1.getValueType() != VT)
7945 if (i * 2 == NumElts)
7946 ExpectedVExtractIdx = BaseIdx;
7949 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7950 if (I0 == ExpectedVExtractIdx)
7951 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7952 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7953 // Try to match the following dag sequence:
7954 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7955 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7959 ExpectedVExtractIdx += 2;
7965 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7966 /// a concat_vector.
7968 /// This is a helper function of LowerToHorizontalOp().
7969 /// This function expects two 256-bit vectors called V0 and V1.
7970 /// At first, each vector is split into two separate 128-bit vectors.
7971 /// Then, the resulting 128-bit vectors are used to implement two
7972 /// horizontal binary operations.
7974 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7976 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7977 /// the two new horizontal binops.
7978 /// When Mode is set, the first horizontal binop dag node would take as input
7979 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7980 /// horizontal binop dag node would take as input the lower 128-bit of V1
7981 /// and the upper 128-bit of V1.
7983 /// HADD V0_LO, V0_HI
7984 /// HADD V1_LO, V1_HI
7986 /// Otherwise, the first horizontal binop dag node takes as input the lower
7987 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7988 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7990 /// HADD V0_LO, V1_LO
7991 /// HADD V0_HI, V1_HI
7993 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7994 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7995 /// the upper 128-bits of the result.
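/// For example, for 256-bit inputs and X86Opcode == X86ISD::HADD:
///   Mode == true : concat(HADD(V0_LO, V0_HI), HADD(V1_LO, V1_HI))
///   Mode == false: concat(HADD(V0_LO, V1_LO), HADD(V0_HI, V1_HI))
/// with either 128-bit half replaced by UNDEF when isUndefLO/isUndefHI is set.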
7996 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7997 const SDLoc &DL, SelectionDAG &DAG,
7998 unsigned X86Opcode, bool Mode,
7999 bool isUndefLO, bool isUndefHI) {
8000 MVT VT = V0.getSimpleValueType();
8001 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8002 "Invalid nodes in input!");
8004 unsigned NumElts = VT.getVectorNumElements();
8005 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8006 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8007 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8008 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8009 MVT NewVT = V0_LO.getSimpleValueType();
8011 SDValue LO = DAG.getUNDEF(NewVT);
8012 SDValue HI = DAG.getUNDEF(NewVT);
8015 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8016 if (!isUndefLO && !V0->isUndef())
8017 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8018 if (!isUndefHI && !V1->isUndef())
8019 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8021 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8022 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8023 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8025 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8026 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8029 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8032 /// Returns true iff \p BV builds a vector with the result equivalent to
8033 /// the result of ADDSUB/SUBADD operation.
8034 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8035 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8036 /// \p Opnd0 and \p Opnd1.
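/// For example, a v4f32 build_vector of roughly the form
///   (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
///   (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
///   (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
///   (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3))
/// is recognized with \p Opnd0 == A, \p Opnd1 == B and IsSubAdd == false,
/// i.e. the ADDSUB form (subtract in even lanes, add in odd lanes).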
8037 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8038 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8039 SDValue &Opnd0, SDValue &Opnd1,
8040 unsigned &NumExtracts,
8043 MVT VT = BV->getSimpleValueType(0);
8044 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8047 unsigned NumElts = VT.getVectorNumElements();
8048 SDValue InVec0 = DAG.getUNDEF(VT);
8049 SDValue InVec1 = DAG.getUNDEF(VT);
8053 // Odd-numbered elements in the input build vector are obtained from
8054 // adding/subtracting two integer/float elements.
8055 // Even-numbered elements in the input build vector are obtained from
8056 // subtracting/adding two integer/float elements.
8057 unsigned Opc[2] = {0, 0};
8058 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8059 SDValue Op = BV->getOperand(i);
8061 // Skip 'undef' values.
8062 unsigned Opcode = Op.getOpcode();
8063 if (Opcode == ISD::UNDEF)
8066 // Early exit if we found an unexpected opcode.
8067 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8070 SDValue Op0 = Op.getOperand(0);
8071 SDValue Op1 = Op.getOperand(1);
8073 // Try to match the following pattern:
8074 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8075 // Early exit if we cannot match that sequence.
8076 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8077 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8078 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8079 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
8080 Op0.getOperand(1) != Op1.getOperand(1))
8083 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
8087 // We found a valid add/sub node; make sure it's the same opcode as previous
8088 // elements for this parity.
8089 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8091 Opc[i % 2] = Opcode;
8093 // Update InVec0 and InVec1.
8094 if (InVec0.isUndef()) {
8095 InVec0 = Op0.getOperand(0);
8096 if (InVec0.getSimpleValueType() != VT)
8099 if (InVec1.isUndef()) {
8100 InVec1 = Op1.getOperand(0);
8101 if (InVec1.getSimpleValueType() != VT)
8105 // Make sure that the operands of each add/sub node always
8106 // come from the same pair of vectors.
8107 if (InVec0 != Op0.getOperand(0)) {
8108 if (Opcode == ISD::FSUB)
8111 // FADD is commutable. Try to commute the operands
8112 // and then test again.
8113 std::swap(Op0, Op1);
8114 if (InVec0 != Op0.getOperand(0))
8118 if (InVec1 != Op1.getOperand(0))
8121 // Increment the number of extractions done.
8125 // Ensure we have found an opcode for both parities and that they are
8126 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8127 // inputs are undef.
8128 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8129 InVec0.isUndef() || InVec1.isUndef())
8132 IsSubAdd = Opc[0] == ISD::FADD;
8139 /// Returns true if it is possible to fold MUL and an idiom that has already been
8140 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8141 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8142 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8144 /// Prior to calling this function it should be known that there is some
8145 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8146 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8147 /// before replacement of such SDNode with ADDSUB operation. Thus the number
8148 /// of \p Opnd0 uses is expected to be equal to 2.
8149 /// For example, this function may be called for the following IR:
8150 /// %AB = fmul fast <2 x double> %A, %B
8151 /// %Sub = fsub fast <2 x double> %AB, %C
8152 /// %Add = fadd fast <2 x double> %AB, %C
8153 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8154 /// <2 x i32> <i32 0, i32 3>
8155 /// There is a def for %Addsub here, which potentially can be replaced by
8156 /// X86ISD::ADDSUB operation:
8157 /// %Addsub = X86ISD::ADDSUB %AB, %C
8158 /// and such ADDSUB can further be replaced with FMADDSUB:
8159 /// %Addsub = FMADDSUB %A, %B, %C.
8161 /// The main reason why this method is called before the replacement of the
8162 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8163 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit FMADDSUB is.
8165 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8167 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8168 unsigned ExpectedUses) {
8169 if (Opnd0.getOpcode() != ISD::FMUL ||
8170 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8173 // FIXME: These checks must match the similar ones in
8174 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8175 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8176 // or MUL + ADDSUB to FMADDSUB.
8177 const TargetOptions &Options = DAG.getTarget().Options;
8179 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8184 Opnd1 = Opnd0.getOperand(1);
8185 Opnd0 = Opnd0.getOperand(0);
8190 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8191 /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8192 /// X86ISD::FMSUBADD node.
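/// For example, for v4f32 the recognized idiom typically becomes a single
/// (X86ISD::ADDSUB A, B) node, which selects to ADDSUBPS; when the common
/// input is an eligible (fmul X, Y), it is emitted as FMADDSUB/FMSUBADD
/// instead.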
8193 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8194 const X86Subtarget &Subtarget,
8195 SelectionDAG &DAG) {
8196 SDValue Opnd0, Opnd1;
8197 unsigned NumExtracts;
8199 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8203 MVT VT = BV->getSimpleValueType(0);
8206 // Try to generate X86ISD::FMADDSUB node here.
8208 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8209 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8210 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8213 // We only support ADDSUB.
8217 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
8218 // the ADDSUB idiom has been successfully recognized. There are no known
8219 // X86 targets with 512-bit ADDSUB instructions!
8220 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom recognition.
8222 if (VT.is512BitVector())
8225 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8228 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8229 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8230 // Initialize outputs to known values.
8231 MVT VT = BV->getSimpleValueType(0);
8232 HOpcode = ISD::DELETED_NODE;
8233 V0 = DAG.getUNDEF(VT);
8234 V1 = DAG.getUNDEF(VT);
8236 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8237 // half of the result is calculated independently from the 128-bit halves of
8238 // the inputs, so that makes the index-checking logic below more complicated.
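// For example, a v8i32 HADD of A and B produces, per 128-bit chunk:
//   result[0..3] = { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }
//   result[4..7] = { A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7] }
// and the expected-index computation below encodes exactly this layout.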
8239 unsigned NumElts = VT.getVectorNumElements();
8240 unsigned GenericOpcode = ISD::DELETED_NODE;
8241 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8242 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8243 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8244 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8245 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8246 // Ignore undef elements.
8247 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8251 // If there's an opcode mismatch, we're done.
8252 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8255 // Initialize horizontal opcode.
8256 if (HOpcode == ISD::DELETED_NODE) {
8257 GenericOpcode = Op.getOpcode();
8258 switch (GenericOpcode) {
8259 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8260 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8261 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8262 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8263 default: return false;
8267 SDValue Op0 = Op.getOperand(0);
8268 SDValue Op1 = Op.getOperand(1);
8269 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8270 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8271 Op0.getOperand(0) != Op1.getOperand(0) ||
8272 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8273 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8276 // The source vector is chosen based on which 64-bit half of the
8277 // destination vector is being calculated.
8278 if (j < NumEltsIn64Bits) {
8280 V0 = Op0.getOperand(0);
8283 V1 = Op0.getOperand(0);
8286 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8287 if (SourceVec != Op0.getOperand(0))
8290 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8291 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8292 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8293 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8294 (j % NumEltsIn64Bits) * 2;
8295 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8298 // If this is not a commutative op, this does not match.
8299 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8302 // Addition is commutative, so try swapping the extract indexes.
8303 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8304 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8307 // Extract indexes do not match horizontal requirement.
8311 // We matched. Opcode and operands are returned by reference as arguments.
8315 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8316 SelectionDAG &DAG, unsigned HOpcode,
8317 SDValue V0, SDValue V1) {
8318 // If either input vector is not the same size as the build vector,
8319 // extract/insert the low bits to the correct size.
8320 // This is free (examples: zmm --> xmm, xmm --> ymm).
8321 MVT VT = BV->getSimpleValueType(0);
8322 unsigned Width = VT.getSizeInBits();
8323 if (V0.getValueSizeInBits() > Width)
8324 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
8325 else if (V0.getValueSizeInBits() < Width)
8326 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
8328 if (V1.getValueSizeInBits() > Width)
8329 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
8330 else if (V1.getValueSizeInBits() < Width)
8331 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
8333 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
8336 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8337 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
8338 const X86Subtarget &Subtarget,
8339 SelectionDAG &DAG) {
8340 // We need at least 2 non-undef elements to make this worthwhile by default.
8341 unsigned NumNonUndefs = 0;
8342 for (const SDValue &V : BV->op_values())
8346 if (NumNonUndefs < 2)
8349 // There are 4 sets of horizontal math operations distinguished by type:
8350 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8351 // subtarget feature. Try to match those "native" patterns first.
8352 MVT VT = BV->getSimpleValueType(0);
8355 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3())
8356 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8357 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
8359 if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3())
8360 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8361 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
8363 if ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX())
8364 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8365 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
8367 if ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())
8368 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8369 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
8371 // Try harder to match 256-bit ops by using extract/concat.
8372 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8375 // Count the number of UNDEF operands in the input build_vector.
8376 unsigned NumElts = VT.getVectorNumElements();
8377 unsigned Half = NumElts / 2;
8378 unsigned NumUndefsLO = 0;
8379 unsigned NumUndefsHI = 0;
8380 for (unsigned i = 0, e = Half; i != e; ++i)
8381 if (BV->getOperand(i)->isUndef())
8384 for (unsigned i = Half, e = NumElts; i != e; ++i)
8385 if (BV->getOperand(i)->isUndef())
8389 SDValue InVec0, InVec1;
8390 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8391 SDValue InVec2, InVec3;
8393 bool CanFold = true;
8395 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
8396 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
8398 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8399 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8400 X86Opcode = X86ISD::HADD;
8401 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
8403 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
8405 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8406 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8407 X86Opcode = X86ISD::HSUB;
8412 // Do not try to expand this build_vector into a pair of horizontal
8413 // add/sub if we can emit a pair of scalar add/sub.
8414 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8417 // Convert this build_vector into a pair of horizontal binops followed by
8418 // a concat vector. We must adjust the outputs from the partial horizontal
8419 // matching calls above to account for undefined vector halves.
8420 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8421 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8422 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8423 bool isUndefLO = NumUndefsLO == Half;
8424 bool isUndefHI = NumUndefsHI == Half;
8425 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8430 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8431 VT == MVT::v16i16) {
8433 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
8434 X86Opcode = X86ISD::HADD;
8435 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
8437 X86Opcode = X86ISD::HSUB;
8438 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
8440 X86Opcode = X86ISD::FHADD;
8441 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
8443 X86Opcode = X86ISD::FHSUB;
8447 // Don't try to expand this build_vector into a pair of horizontal add/sub
8448 // if we can simply emit a pair of scalar add/sub.
8449 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8452 // Convert this build_vector into two horizontal add/sub followed by
8454 bool isUndefLO = NumUndefsLO == Half;
8455 bool isUndefHI = NumUndefsHI == Half;
8456 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8457 isUndefLO, isUndefHI);
8463 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
8464 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8465 /// just apply the bit operation to the vectors.
8466 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8467 /// from this, but enough scalar bit operations are created from the later
8468 /// legalization + scalarization stages to need basic support.
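/// For example, roughly:
///   (v4i32 build_vector (and a, 1), (and b, 2), (and c, 4), (and d, 8))
/// becomes
///   (and (v4i32 build_vector a, b, c, d), (v4i32 build_vector 1, 2, 4, 8))
/// so only the non-constant operands still need to be assembled into a vector.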
8469 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
8470 SelectionDAG &DAG) {
8472 MVT VT = Op->getSimpleValueType(0);
8473 unsigned NumElems = VT.getVectorNumElements();
8474 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8476 // Check that all elements have the same opcode.
8477 // TODO: Should we allow UNDEFS and if so how many?
8478 unsigned Opcode = Op->getOperand(0).getOpcode();
8479 for (unsigned i = 1; i < NumElems; ++i)
8480 if (Opcode != Op->getOperand(i).getOpcode())
8483 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8490 // Don't do this if the buildvector is a splat - we'd replace one
8491 // constant with an entire vector.
8492 if (Op->getSplatValue())
8494 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8499 SmallVector<SDValue, 4> LHSElts, RHSElts;
8500 for (SDValue Elt : Op->ops()) {
8501 SDValue LHS = Elt.getOperand(0);
8502 SDValue RHS = Elt.getOperand(1);
8504 // We expect the canonicalized RHS operand to be the constant.
8505 if (!isa<ConstantSDNode>(RHS))
8507 LHSElts.push_back(LHS);
8508 RHSElts.push_back(RHS);
8511 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8512 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8513 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
8516 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
8517 /// functionality to do this, so it's all zeros, all ones, or some derivation
8518 /// that is cheap to calculate.
8519 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
8520 const X86Subtarget &Subtarget) {
8522 MVT VT = Op.getSimpleValueType();
8524 // Vectors containing all zeros can be matched by pxor and xorps.
8525 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
8526 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
8527 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
8528 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8531 return getZeroVector(VT, Subtarget, DAG, DL);
8534 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8535 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8536 // vpcmpeqd on 256-bit vectors.
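// For example, an all-ones v8i32 build_vector with AVX2 can be matched
// directly as a single vpcmpeqd ymm, while without AVX2 getOnesVector()
// typically ends up as two 128-bit pcmpeqd results that are concatenated,
// avoiding a constant-pool load either way.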
8537 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8538 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
8539 (VT == MVT::v8i32 && Subtarget.hasInt256()))
8542 return getOnesVector(VT, DAG, DL);
8548 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8549 /// from a vector of source values and a vector of extraction indices.
8550 /// The vectors might be manipulated to match the type of the permute op.
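/// For example, a v16i8 variable permute on an SSSE3 target is emitted as
/// (X86ISD::PSHUFB SrcVec, IndicesVec), while a v8i32 permute with AVX2 uses
/// (X86ISD::VPERMV IndicesVec, SrcVec); note that VPERMV takes the index
/// vector as its first operand.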
8551 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8552 SDLoc &DL, SelectionDAG &DAG,
8553 const X86Subtarget &Subtarget) {
8555 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8556 unsigned NumElts = VT.getVectorNumElements();
8557 unsigned SizeInBits = VT.getSizeInBits();
8559 // Adjust IndicesVec to match VT size.
8560 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8561 "Illegal variable permute mask size");
8562 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8563 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8564 NumElts * VT.getScalarSizeInBits());
8565 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8567 // Handle a SrcVec whose size doesn't match the VT size.
8568 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8569 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8570 // Handle larger SrcVec by treating it as a larger permute.
8571 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8572 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8573 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8574 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8575 Subtarget, DAG, SDLoc(IndicesVec));
8576 return extractSubVector(
8577 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
8578 DAG, DL, SizeInBits);
8579 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8580 // Widen smaller SrcVec to match VT.
8581 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8586 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8587 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8588 EVT SrcVT = Idx.getValueType();
8589 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8590 uint64_t IndexScale = 0;
8591 uint64_t IndexOffset = 0;
8593 // If we're scaling a smaller permute op, then we need to repeat the
8594 // indices, scaling and offsetting them as well.
8595 // e.g. v4i32 -> v16i8 (Scale = 4)
8596 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8597 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8598 for (uint64_t i = 0; i != Scale; ++i) {
8599 IndexScale |= Scale << (i * NumDstBits);
8600 IndexOffset |= i << (i * NumDstBits);
8603 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8604 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8605 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8606 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8610 unsigned Opcode = 0;
8611 switch (VT.SimpleTy) {
8615 if (Subtarget.hasSSSE3())
8616 Opcode = X86ISD::PSHUFB;
8619 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8620 Opcode = X86ISD::VPERMV;
8621 else if (Subtarget.hasSSSE3()) {
8622 Opcode = X86ISD::PSHUFB;
8623 ShuffleVT = MVT::v16i8;
8628 if (Subtarget.hasAVX()) {
8629 Opcode = X86ISD::VPERMILPV;
8630 ShuffleVT = MVT::v4f32;
8631 } else if (Subtarget.hasSSSE3()) {
8632 Opcode = X86ISD::PSHUFB;
8633 ShuffleVT = MVT::v16i8;
8638 if (Subtarget.hasAVX()) {
8639 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8640 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8641 Opcode = X86ISD::VPERMILPV;
8642 ShuffleVT = MVT::v2f64;
8643 } else if (Subtarget.hasSSE41()) {
8644 // SSE41 can compare v2i64 - select between indices 0 and 1.
8645 return DAG.getSelectCC(
8647 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8648 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8649 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8650 ISD::CondCode::SETEQ);
8654 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8655 Opcode = X86ISD::VPERMV;
8656 else if (Subtarget.hasXOP()) {
8657 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8658 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8659 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8660 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8662 ISD::CONCAT_VECTORS, DL, VT,
8663 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8664 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8665 } else if (Subtarget.hasAVX()) {
8666 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8667 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8668 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8669 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8670 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8671 ArrayRef<SDValue> Ops) {
8672 // Permute Lo and Hi and then select based on index range.
8673 // This works because PSHUFB uses bits[3:0] to permute elements and we don't
8674 // care about bit[7] as it's just an index vector.
8675 SDValue Idx = Ops[2];
8676 EVT VT = Idx.getValueType();
8677 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8678 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8679 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8680 ISD::CondCode::SETGT);
8682 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8683 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8688 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8689 Opcode = X86ISD::VPERMV;
8690 else if (Subtarget.hasAVX()) {
8691 // Scale to v32i8 and perform as v32i8.
8692 IndicesVec = ScaleIndices(IndicesVec, 2);
8693 return DAG.getBitcast(
8694 VT, createVariablePermute(
8695 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8696 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8701 if (Subtarget.hasAVX2())
8702 Opcode = X86ISD::VPERMV;
8703 else if (Subtarget.hasAVX()) {
8704 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8705 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8706 {0, 1, 2, 3, 0, 1, 2, 3});
8707 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8708 {4, 5, 6, 7, 4, 5, 6, 7});
8709 if (Subtarget.hasXOP())
8710 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
8711 LoLo, HiHi, IndicesVec,
8712 DAG.getConstant(0, DL, MVT::i8)));
8713 // Permute Lo and Hi and then select based on index range.
8714 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8715 SDValue Res = DAG.getSelectCC(
8716 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8717 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8718 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8719 ISD::CondCode::SETGT);
8720 return DAG.getBitcast(VT, Res);
8725 if (Subtarget.hasAVX512()) {
8726 if (!Subtarget.hasVLX()) {
8727 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8728 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8730 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8731 DAG, SDLoc(IndicesVec));
8732 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8734 return extract256BitVector(Res, 0, DAG, DL);
8736 Opcode = X86ISD::VPERMV;
8737 } else if (Subtarget.hasAVX()) {
8738 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8740 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8742 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8743 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8744 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8745 if (Subtarget.hasXOP())
8746 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
8747 LoLo, HiHi, IndicesVec,
8748 DAG.getConstant(0, DL, MVT::i8)));
8749 // Permute Lo and Hi and then select based on index range.
8750 // This works as VPERMILPD only uses index bit[1] to permute elements.
8751 SDValue Res = DAG.getSelectCC(
8752 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8753 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8754 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8755 ISD::CondCode::SETGT);
8756 return DAG.getBitcast(VT, Res);
8760 if (Subtarget.hasVBMI())
8761 Opcode = X86ISD::VPERMV;
8764 if (Subtarget.hasBWI())
8765 Opcode = X86ISD::VPERMV;
8771 if (Subtarget.hasAVX512())
8772 Opcode = X86ISD::VPERMV;
8778 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8779 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8780 "Illegal variable permute shuffle type");
8782 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8784 IndicesVec = ScaleIndices(IndicesVec, Scale);
8786 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8787 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8789 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8790 SDValue Res = Opcode == X86ISD::VPERMV
8791 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8792 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8793 return DAG.getBitcast(VT, Res);
8796 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8797 // reasoned to be a permutation of a vector by indices in a non-constant vector.
8798 // (build_vector (extract_elt V, (extract_elt I, 0)),
8799 // (extract_elt V, (extract_elt I, 1)),
8804 // TODO: Handle undefs
8805 // TODO: Utilize pshufb and zero mask blending to support more efficient
8806 // construction of vectors with constant-0 elements.
8808 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
8809 const X86Subtarget &Subtarget) {
8810 SDValue SrcVec, IndicesVec;
8811 // Check for a match of the permute source vector and permute index elements.
8812 // This is done by checking that the i-th build_vector operand is of the form:
8813 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8814 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8815 SDValue Op = V.getOperand(Idx);
8816 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8819 // If this is the first extract encountered in V, set the source vector,
8820 // otherwise verify the extract is from the previously defined source
8823 SrcVec = Op.getOperand(0);
8824 else if (SrcVec != Op.getOperand(0))
8826 SDValue ExtractedIndex = Op->getOperand(1);
8827 // Peek through extends.
8828 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8829 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8830 ExtractedIndex = ExtractedIndex.getOperand(0);
8831 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8834 // If this is the first extract from the index vector candidate, set the
8835 // indices vector, otherwise verify the extract is from the previously
8836 // defined indices vector.
8838 IndicesVec = ExtractedIndex.getOperand(0);
8839 else if (IndicesVec != ExtractedIndex.getOperand(0))
8842 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8843 if (!PermIdx || PermIdx->getZExtValue() != Idx)
8848 MVT VT = V.getSimpleValueType();
8849 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8853 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8856 MVT VT = Op.getSimpleValueType();
8857 MVT EltVT = VT.getVectorElementType();
8858 unsigned NumElems = Op.getNumOperands();
8860 // Generate vectors for predicate vectors.
8861 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8862 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
8864 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
8865 return VectorConstant;
8867 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8868 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
8870 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
8871 return HorizontalOp;
8872 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
8874 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
8877 unsigned EVTBits = EltVT.getSizeInBits();
8879 unsigned NumZero = 0;
8880 unsigned NumNonZero = 0;
8881 uint64_t NonZeros = 0;
8882 bool IsAllConstants = true;
8883 SmallSet<SDValue, 8> Values;
8884 unsigned NumConstants = NumElems;
8885 for (unsigned i = 0; i < NumElems; ++i) {
8886 SDValue Elt = Op.getOperand(i);
8890 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
8891 IsAllConstants = false;
8894 if (X86::isZeroNode(Elt))
8897 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
8898 NonZeros |= ((uint64_t)1 << i);
8903 // All undef vector. Return an UNDEF. All zero vectors were handled above.
8904 if (NumNonZero == 0)
8905 return DAG.getUNDEF(VT);
8907 // If we are inserting one variable into a vector of non-zero constants, try
8908 // to avoid loading each constant element as a scalar. Load the constants as a
8909 // vector and then insert the variable scalar element. If insertion is not
8910 // supported, fall back to a shuffle to get the scalar blended with the
8911 // constants. Insertion into a zero vector is handled as a special-case
8912 // somewhere below here.
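// For example, (v4f32 build_vector 1.0, 2.0, %x, 4.0) becomes roughly a load
// of the constant-pool vector <1.0, 2.0, undef, 4.0> followed by
// (insert_vector_elt Ld, %x, 2), or a shuffle with (scalar_to_vector %x) when
// the variable element lands in the upper half of a YMM/ZMM value.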
8913 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8914 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8915 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8916 // Create an all-constant vector. The variable element in the old
8917 // build vector is replaced by undef in the constant vector. Save the
8918 // variable scalar element and its index for use in the insertelement.
8919 LLVMContext &Context = *DAG.getContext();
8920 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8921 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8924 for (unsigned i = 0; i != NumElems; ++i) {
8925 SDValue Elt = Op.getOperand(i);
8926 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8927 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8928 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8929 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8930 else if (!Elt.isUndef()) {
8931 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8932 "Expected one variable element in this vector");
8934 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
8937 Constant *CV = ConstantVector::get(ConstVecOps);
8938 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8940 // The constants we just created may not be legal (e.g., floating point). We
8941 // must lower the vector right here because we can not guarantee that we'll
8942 // legalize it before loading it. This is also why we could not just create
8943 // a new build vector here. If the build vector contains illegal constants,
8944 // it could get split back up into a series of insert elements.
8945 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8946 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8947 MachineFunction &MF = DAG.getMachineFunction();
8948 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8949 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8950 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
8951 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8952 if (InsertC < NumEltsInLow128Bits)
8953 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8955 // There's no good way to insert into the high elements of a >128-bit
8956 // vector, so use shuffles to avoid an extract/insert sequence.
8957 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8958 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8959 SmallVector<int, 8> ShuffleMask;
8960 unsigned NumElts = VT.getVectorNumElements();
8961 for (unsigned i = 0; i != NumElts; ++i)
8962 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8963 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8964 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8967 // Special case for single non-zero, non-undef, element.
8968 if (NumNonZero == 1) {
8969 unsigned Idx = countTrailingZeros(NonZeros);
8970 SDValue Item = Op.getOperand(Idx);
8972 // If we have a constant or non-constant insertion into the low element of
8973 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8974 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8975 // depending on what the source datatype is.
8978 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8980 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
8981 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
8982 assert((VT.is128BitVector() || VT.is256BitVector() ||
8983 VT.is512BitVector()) &&
8984 "Expected an SSE value type!");
8985 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8986 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8987 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8990 // We can't directly insert an i8 or i16 into a vector, so zero extend
8992 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8993 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8994 if (VT.getSizeInBits() >= 256) {
8995 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8996 if (Subtarget.hasAVX()) {
8997 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8998 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9000 // Without AVX, we need to extend to a 128-bit vector and then
9001 // insert into the 256-bit vector.
9002 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
9003 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
9004 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
9007 assert(VT.is128BitVector() && "Expected an SSE value type!");
9008 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
9009 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9011 return DAG.getBitcast(VT, Item);
9015 // Is it a vector logical left shift?
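// E.g. (v2i64 build_vector 0, %x) can be emitted as a whole-register byte
// shift of (scalar_to_vector %x) left by NumBits/2 == 64 bits (roughly a
// pslldq by 8 bytes), leaving element 0 as zero.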
9016 if (NumElems == 2 && Idx == 1 &&
9017 X86::isZeroNode(Op.getOperand(0)) &&
9018 !X86::isZeroNode(Op.getOperand(1))) {
9019 unsigned NumBits = VT.getSizeInBits();
9020 return getVShift(true, VT,
9021 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9022 VT, Op.getOperand(1)),
9023 NumBits/2, DAG, *this, dl);
9026 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9029 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9030 // is a non-constant being inserted into an element other than the low one,
9031 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9032 // movd/movss) to move this into the low element, then shuffle it into
9034 if (EVTBits == 32) {
9035 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9036 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9040 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9041 if (Values.size() == 1) {
9042 if (EVTBits == 32) {
9043 // Instead of a shuffle like this:
9044 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9045 // Check if it's possible to issue this instead.
9046 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9047 unsigned Idx = countTrailingZeros(NonZeros);
9048 SDValue Item = Op.getOperand(Idx);
9049 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9050 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9055 // A vector full of immediates; various special cases are already
9056 // handled, so this is best done with a single constant-pool load.
9060 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
9063 // See if we can use a vector load to get all of the elements.
9065 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9067 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9071 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9072 // build_vector and broadcast it.
9073 // TODO: We could probably generalize this more.
9074 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9075 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9076 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9077 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9078 // Make sure all the even/odd operands match.
9079 for (unsigned i = 2; i != NumElems; ++i)
9080 if (Ops[i % 2] != Op.getOperand(i))
9084 if (CanSplat(Op, NumElems, Ops)) {
9085 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9086 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9087 // Create a new build vector and cast to v2i64/v2f64.
9088 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9089 DAG.getBuildVector(NarrowVT, dl, Ops));
9090 // Broadcast from v2i64/v2f64 and cast to final VT.
9091 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
9092 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9097 // For AVX-length vectors, build the individual 128-bit pieces and use
9098 // shuffles to put them in place.
9099 if (VT.getSizeInBits() > 128) {
9100 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
9102 // Build both the lower and upper subvector.
9104 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9105 SDValue Upper = DAG.getBuildVector(
9106 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9108 // Recreate the wider vector with the lower and upper part.
9109 return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
9110 VT.getSizeInBits() / 2);
9113 // Let legalizer expand 2-wide build_vectors.
9114 if (EVTBits == 64) {
9115 if (NumNonZero == 1) {
9116 // One half is zero or undef.
9117 unsigned Idx = countTrailingZeros(NonZeros);
9118 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9119 Op.getOperand(Idx));
9120 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9125 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9126 if (EVTBits == 8 && NumElems == 16)
9127 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
9131 if (EVTBits == 16 && NumElems == 8)
9132 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
9136 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9137 if (EVTBits == 32 && NumElems == 4)
9138 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
9141 // If element VT is == 32 bits, turn it into a number of shuffles.
9142 if (NumElems == 4 && NumZero > 0) {
9143 SmallVector<SDValue, 8> Ops(NumElems);
9144 for (unsigned i = 0; i < 4; ++i) {
9145 bool isZero = !(NonZeros & (1ULL << i));
9147 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9149 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9152 for (unsigned i = 0; i < 2; ++i) {
9153 switch ((NonZeros >> (i*2)) & 0x3) {
9154 default: llvm_unreachable("Unexpected NonZero count");
9156 Ops[i] = Ops[i*2]; // Must be a zero vector.
9159 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9162 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9165 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9170 bool Reverse1 = (NonZeros & 0x3) == 2;
9171 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
9175 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9176 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9178 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9181 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9183 // Check for a build vector from mostly shuffle plus few inserting.
9184 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
9187 // For SSE 4.1, use insertps to put the high elements into the low element.
9188 if (Subtarget.hasSSE41()) {
9190 if (!Op.getOperand(0).isUndef())
9191 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9193 Result = DAG.getUNDEF(VT);
9195 for (unsigned i = 1; i < NumElems; ++i) {
9196 if (Op.getOperand(i).isUndef()) continue;
9197 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9198 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
9203 // Otherwise, expand into a number of unpckl*, start by extending each of
9204 // our (non-undef) elements to the full vector width with the element in the
9205 // bottom slot of the vector (which generates no code for SSE).
9206 SmallVector<SDValue, 8> Ops(NumElems);
9207 for (unsigned i = 0; i < NumElems; ++i) {
9208 if (!Op.getOperand(i).isUndef())
9209 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9211 Ops[i] = DAG.getUNDEF(VT);
9214 // Next, we iteratively mix elements, e.g. for v4f32:
9215 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9216 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9217 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9218 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9219 // Generate scaled UNPCKL shuffle mask.
9220 SmallVector<int, 16> Mask;
9221 for(unsigned i = 0; i != Scale; ++i)
9223 for (unsigned i = 0; i != Scale; ++i)
9224 Mask.push_back(NumElems+i);
9225 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9227 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9228 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9233 // 256-bit AVX can use the vinsertf128 instruction
9234 // to create 256-bit vectors from two other 128-bit ones.
9235 // TODO: Detect subvector broadcast here instead of DAG combine?
9236 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9237 const X86Subtarget &Subtarget) {
9239 MVT ResVT = Op.getSimpleValueType();
9241 assert((ResVT.is256BitVector() ||
9242 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9244 unsigned NumOperands = Op.getNumOperands();
9245 unsigned NumZero = 0;
9246 unsigned NumNonZero = 0;
9247 unsigned NonZeros = 0;
9248 for (unsigned i = 0; i != NumOperands; ++i) {
9249 SDValue SubVec = Op.getOperand(i);
9250 if (SubVec.isUndef())
9252 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9255 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9261 // If we have more than 2 non-zeros, build each half separately.
9262 if (NumNonZero > 2) {
9263 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
9264 ResVT.getVectorNumElements()/2);
9265 ArrayRef<SDUse> Ops = Op->ops();
9266 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9267 Ops.slice(0, NumOperands/2));
9268 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9269 Ops.slice(NumOperands/2));
9270 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9273 // Otherwise, build it up through insert_subvectors.
9274 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9275 : DAG.getUNDEF(ResVT);
9277 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9278 unsigned NumSubElems = SubVT.getVectorNumElements();
9279 for (unsigned i = 0; i != NumOperands; ++i) {
9280 if ((NonZeros & (1 << i)) == 0)
9283 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9285 DAG.getIntPtrConstant(i * NumSubElems, dl));
9291 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
9292 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
9293 static bool isExpandWithZeros(const SDValue &Op) {
9294 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
9295 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
9297 for (unsigned i = 1; i < Op.getNumOperands(); i++)
9298 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
9304 // If the given node is a type promotion (by concatenating i1 zeros) of the
9305 // result of a node that already zeros all upper bits of its output register,
9306 // returns that inner node; otherwise returns an empty SDValue.
9307 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
9308 unsigned Opc = Op.getOpcode();
9310 assert(Opc == ISD::CONCAT_VECTORS &&
9311 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
9312 "Unexpected node to check for type promotion!");
9314 // As long as we are concatenating zeros to the upper part of a previous node
9315 // result, climb up the tree until a node with different opcode is
9316 // encountered.
9317 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
9318 if (Opc == ISD::INSERT_SUBVECTOR) {
9319 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
9320 Op.getConstantOperandVal(2) == 0)
9321 Op = Op.getOperand(1);
9324 } else { // Opc == ISD::CONCAT_VECTORS
9325 if (isExpandWithZeros(Op))
9326 Op = Op.getOperand(0);
9330 Opc = Op.getOpcode();
9333 // Check if the first inserted node zeroes the upper bits, or an 'and' result
9334 // of a node that zeros the upper bits (its masked version).
9335 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
9336 (Op.getOpcode() == ISD::AND &&
9337 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
9338 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
9345 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
9346 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9347 const X86Subtarget &Subtarget,
9348 SelectionDAG & DAG) {
9350 MVT ResVT = Op.getSimpleValueType();
9351 unsigned NumOperands = Op.getNumOperands();
9353 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9354 "Unexpected number of operands in CONCAT_VECTORS");
9356 // If this node promotes - by concatenating zeroes - the type of the result
9357 // of a node whose instruction already zeroes all upper (irrelevant) bits of
9358 // the output register, mark it as legal and catch the pattern in instruction
9359 // selection to avoid emitting extra instructions (for zeroing upper bits).
9360 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
9361 return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
9363 unsigned NumZero = 0;
9364 unsigned NumNonZero = 0;
9365 uint64_t NonZeros = 0;
9366 for (unsigned i = 0; i != NumOperands; ++i) {
9367 SDValue SubVec = Op.getOperand(i);
9368 if (SubVec.isUndef())
9370 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9373 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9374 NonZeros |= (uint64_t)1 << i;
9380 // If there are zero or one non-zeros we can handle this very simply.
9381 if (NumNonZero <= 1) {
9382 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9383 : DAG.getUNDEF(ResVT);
9386 unsigned Idx = countTrailingZeros(NonZeros);
9387 SDValue SubVec = Op.getOperand(Idx);
9388 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9389 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9390 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9393 if (NumOperands > 2) {
9394 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
9395 ResVT.getVectorNumElements()/2);
9396 ArrayRef<SDUse> Ops = Op->ops();
9397 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9398 Ops.slice(0, NumOperands/2));
9399 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9400 Ops.slice(NumOperands/2));
9401 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9404 assert(NumNonZero == 2 && "Simple cases not handled?");
9406 if (ResVT.getVectorNumElements() >= 16)
9407 return Op; // The operation is legal with KUNPCK
9409 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9410 DAG.getUNDEF(ResVT), Op.getOperand(0),
9411 DAG.getIntPtrConstant(0, dl));
9412 unsigned NumElems = ResVT.getVectorNumElements();
9413 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9414 DAG.getIntPtrConstant(NumElems/2, dl));
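// For example, a v8i1 concat of two non-zero v4i1 subvectors A and B becomes
// insert_subvector(insert_subvector(undef, A, 0), B, 4), while with 16 or more
// result elements the concat is left as-is and selected as a KUNPCK* instruction.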
9417 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9418 const X86Subtarget &Subtarget,
9419 SelectionDAG &DAG) {
9420 MVT VT = Op.getSimpleValueType();
9421 if (VT.getVectorElementType() == MVT::i1)
9422 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9424 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9425 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9426 Op.getNumOperands() == 4)));
9428 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9429 // from two other 128-bit ones.
9431 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9432 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9435 //===----------------------------------------------------------------------===//
9436 // Vector shuffle lowering
9438 // This is an experimental code path for lowering vector shuffles on x86. It is
9439 // designed to handle arbitrary vector shuffles and blends, gracefully
9440 // degrading performance as necessary. It works hard to recognize idiomatic
9441 // shuffles and lower them to optimal instruction patterns without leaving
9442 // a framework that allows reasonably efficient handling of all vector shuffle
9443 // operations.
9444 //===----------------------------------------------------------------------===//
9446 /// Tiny helper function to identify a no-op mask.
9448 /// This is a somewhat boring predicate function. It checks whether the mask
9449 /// array input, which is assumed to be a single-input shuffle mask of the kind
9450 /// used by the X86 shuffle instructions (not a fully general
9451 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9452 /// in-place shuffle are 'no-op's.
9453 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9454 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9455 assert(Mask[i] >= -1 && "Out of bound mask element!");
9456 if (Mask[i] >= 0 && Mask[i] != i)
9462 /// Test whether there are elements crossing 128-bit lanes in this
9463 /// shuffle mask.
9465 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9466 /// and we routinely test for these.
9467 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9468 int LaneSize = 128 / VT.getScalarSizeInBits();
9469 int Size = Mask.size();
9470 for (int i = 0; i < Size; ++i)
9471 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9476 /// Test whether a shuffle mask is equivalent within each sub-lane.
9478 /// This checks a shuffle mask to see if it is performing the same
9479 /// lane-relative shuffle in each sub-lane. This trivially implies
9480 /// that it is also not lane-crossing. It may however involve a blend from the
9481 /// same lane of a second vector.
9483 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9484 /// non-trivial to compute in the face of undef lanes. The representation is
9485 /// suitable for use with existing 128-bit shuffles as entries from the second
9486 /// vector have been remapped to [LaneSize, 2*LaneSize).
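/// For example, the v8f32 mask <1, 1, 3, 3, 5, 5, 7, 7> repeats within each
/// 128-bit lane and yields the RepeatedMask <1, 1, 3, 3>.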
9487 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9489 SmallVectorImpl<int> &RepeatedMask) {
9490 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9491 RepeatedMask.assign(LaneSize, -1);
9492 int Size = Mask.size();
9493 for (int i = 0; i < Size; ++i) {
9494 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9497 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9498 // This entry crosses lanes, so there is no way to model this shuffle.
9501 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9502 // Adjust second vector indices to start at LaneSize instead of Size.
9503 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9504 : Mask[i] % LaneSize + LaneSize;
9505 if (RepeatedMask[i % LaneSize] < 0)
9506 // This is the first non-undef entry in this slot of a 128-bit lane.
9507 RepeatedMask[i % LaneSize] = LocalM;
9508 else if (RepeatedMask[i % LaneSize] != LocalM)
9509 // Found a mismatch with the repeated mask.
9515 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
9516 static bool
9517 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9518 SmallVectorImpl<int> &RepeatedMask) {
9519 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9522 static bool
9523 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9524 SmallVector<int, 32> RepeatedMask;
9525 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9528 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
9529 static bool
9530 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9531 SmallVectorImpl<int> &RepeatedMask) {
9532 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9535 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9536 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9537 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9539 SmallVectorImpl<int> &RepeatedMask) {
9540 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9541 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9542 int Size = Mask.size();
9543 for (int i = 0; i < Size; ++i) {
9544 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9545 if (Mask[i] == SM_SentinelUndef)
9547 if (Mask[i] == SM_SentinelZero) {
9548 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9550 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9553 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9554 // This entry crosses lanes, so there is no way to model this shuffle.
9557 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9558 // Adjust second vector indices to start at LaneSize instead of Size.
9559 int LocalM =
9560 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
9561 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9562 // This is the first non-undef entry in this slot of a 128-bit lane.
9563 RepeatedMask[i % LaneSize] = LocalM;
9564 else if (RepeatedMask[i % LaneSize] != LocalM)
9565 // Found a mismatch with the repeated mask.
9571 /// Checks whether a shuffle mask is equivalent to an explicit list of
9572 /// arguments.
9574 /// This is a fast way to test a shuffle mask against a fixed pattern:
9576 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
9578 /// It returns true if the mask is exactly as wide as the argument list, and
9579 /// each element of the mask is either -1 (signifying undef) or the value given
9580 /// in the argument.
9581 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
9582 ArrayRef<int> ExpectedMask) {
9583 if (Mask.size() != ExpectedMask.size())
9586 int Size = Mask.size();
9588 // If the values are build vectors, we can look through them to find
9589 // equivalent inputs that make the shuffles equivalent.
9590 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
9591 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
9593 for (int i = 0; i < Size; ++i) {
9594 assert(Mask[i] >= -1 && "Out of bound mask element!");
9595 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
9596 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
9597 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
9598 if (!MaskBV || !ExpectedBV ||
9599 MaskBV->getOperand(Mask[i] % Size) !=
9600 ExpectedBV->getOperand(ExpectedMask[i] % Size))
9608 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9610 /// The masks must be exactly the same width.
9612 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9613 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
9615 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
9616 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
9617 ArrayRef<int> ExpectedMask) {
9618 int Size = Mask.size();
9619 if (Size != (int)ExpectedMask.size())
9622 for (int i = 0; i < Size; ++i)
9623 if (Mask[i] == SM_SentinelUndef)
9625 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
9627 else if (Mask[i] != ExpectedMask[i])
9633 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
9634 // mask.
9635 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
9636 const APInt &Zeroable) {
9637 int NumElts = Mask.size();
9638 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
9640 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
9641 for (int i = 0; i != NumElts; ++i) {
9642 int M = Mask[i];
9643 if (M == SM_SentinelUndef)
9644 continue;
9645 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
9646 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
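// For example, Mask = <0, 5, -1, 3> with element 1 zeroable becomes the target
// mask <0, SM_SentinelZero, SM_SentinelUndef, 3>.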
9651 // Attempt to create a shuffle mask from a VSELECT condition mask.
9652 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
9654 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
9657 unsigned Size = Cond.getValueType().getVectorNumElements();
9658 Mask.resize(Size, SM_SentinelUndef);
9660 for (int i = 0; i != (int)Size; ++i) {
9661 SDValue CondElt = Cond.getOperand(i);
9663 // Arbitrarily choose from the 2nd operand if the select condition element
9664 // is undef.
9665 // TODO: Can we do better by matching patterns such as even/odd?
9666 if (CondElt.isUndef() || isNullConstant(CondElt))
9673 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9674 // instructions.
9675 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
9676 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9679 SmallVector<int, 8> Unpcklwd;
9680 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9681 /* Unary = */ false);
9682 SmallVector<int, 8> Unpckhwd;
9683 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9684 /* Unary = */ false);
9685 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
9686 isTargetShuffleEquivalent(Mask, Unpckhwd));
9687 return IsUnpackwdMask;
9690 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9692 /// This helper function produces an 8-bit shuffle immediate corresponding to
9693 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
9694 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
9695 /// example.
9697 /// NB: We rely heavily on "undef" masks preserving the input lane.
9698 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9699 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9700 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9701 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9702 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9703 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9705 unsigned Imm = 0;
9706 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9707 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9708 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9709 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
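// For example, the identity mask <0, 1, 2, 3> encodes as 0xE4 and the
// lane-swap mask <2, 3, 0, 1> encodes as 0x4E.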
9713 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9714 SelectionDAG &DAG) {
9715 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9718 /// Compute whether each element of a shuffle is zeroable.
9720 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
9721 /// Either it is an undef element in the shuffle mask, the element of the input
9722 /// referenced is undef, or the element of the input referenced is known to be
9723 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
9724 /// as many lanes with this technique as possible to simplify the remaining
9725 /// shuffle.
9726 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
9727 SDValue V1, SDValue V2) {
9728 APInt Zeroable(Mask.size(), 0);
9729 V1 = peekThroughBitcasts(V1);
9730 V2 = peekThroughBitcasts(V2);
9732 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
9733 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
9735 int VectorSizeInBits = V1.getValueSizeInBits();
9736 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
9737 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
9739 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9740 int M = Mask[i];
9741 // Handle the easy cases.
9742 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
9747 // Determine shuffle input and normalize the mask.
9748 SDValue V = M < Size ? V1 : V2;
9751 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
9752 if (V.getOpcode() != ISD::BUILD_VECTOR)
9755 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
9756 // the (larger) source element must be UNDEF/ZERO.
9757 if ((Size % V.getNumOperands()) == 0) {
9758 int Scale = Size / V->getNumOperands();
9759 SDValue Op = V.getOperand(M / Scale);
9760 if (Op.isUndef() || X86::isZeroNode(Op))
9762 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
9763 APInt Val = Cst->getAPIntValue();
9764 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9765 Val = Val.getLoBits(ScalarSizeInBits);
9768 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
9769 APInt Val = Cst->getValueAPF().bitcastToAPInt();
9770 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9771 Val = Val.getLoBits(ScalarSizeInBits);
9778 // If the BUILD_VECTOR has more elements, then all the (smaller) source
9779 // elements must be UNDEF or ZERO.
9780 if ((V.getNumOperands() % Size) == 0) {
9781 int Scale = V->getNumOperands() / Size;
9782 bool AllZeroable = true;
9783 for (int j = 0; j < Scale; ++j) {
9784 SDValue Op = V.getOperand((M * Scale) + j);
9785 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
9796 // The shuffle result has the form:
9797 //   0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
9798 //   ascending order and each zeroable element corresponds to a particular
9799 //   mask element, as described in computeZeroableShuffleElements.
9801 // The function looks for a sub-mask whose non-zero elements are in
9802 // increasing order; if such a sub-mask exists, it returns true.
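// For example, Mask = <0, zz, 1, zz> with elements 1 and 3 zeroable matches:
// its non-zero elements (0, 1) appear in increasing order, so the shuffle is a
// candidate for the VEXPAND lowering below.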
9803 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9804 ArrayRef<int> Mask, const EVT &VectorType,
9805 bool &IsZeroSideLeft) {
9806 int NextElement = -1;
9807 // Check if the Mask's nonzero elements are in increasing order.
9808 for (int i = 0, e = Mask.size(); i < e; i++) {
9809 // Checks that the mask's zero elements are built from only zeros.
9810 assert(Mask[i] >= -1 && "Out of bound mask element!");
9815 // Find the lowest non-zero element.
9816 if (NextElement < 0) {
9817 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9818 IsZeroSideLeft = NextElement != 0;
9820 // Exit if the mask's non-zero elements are not in increasing order.
9821 if (NextElement != Mask[i])
9828 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9829 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9830 ArrayRef<int> Mask, SDValue V1,
9832 const APInt &Zeroable,
9833 const X86Subtarget &Subtarget,
9834 SelectionDAG &DAG) {
9835 int Size = Mask.size();
9836 int LaneSize = 128 / VT.getScalarSizeInBits();
9837 const int NumBytes = VT.getSizeInBits() / 8;
9838 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9840 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9841 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9842 (Subtarget.hasBWI() && VT.is512BitVector()));
9844 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9845 // Sign bit set in i8 mask means zero element.
9846 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9849 for (int i = 0; i < NumBytes; ++i) {
9850 int M = Mask[i / NumEltBytes];
9852 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9855 if (Zeroable[i / NumEltBytes]) {
9856 PSHUFBMask[i] = ZeroMask;
9860 // We can only use a single input of V1 or V2.
9861 SDValue SrcV = (M >= Size ? V2 : V1);
9868 // PSHUFB can't cross lanes, so ensure this doesn't happen.
9868 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9872 M = M * NumEltBytes + (i % NumEltBytes);
9873 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9875 assert(V && "Failed to find a source input");
9877 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9878 return DAG.getBitcast(
9879 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9880 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
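// Note that each mask entry expands to NumEltBytes consecutive byte indices;
// e.g. for a v8i16 shuffle, element index 3 becomes byte indices 6 and 7 in
// the PSHUFB control vector.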
9883 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9884 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9887 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
9888 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
9889 const APInt &Zeroable,
9890 ArrayRef<int> Mask, SDValue &V1,
9891 SDValue &V2, SelectionDAG &DAG,
9892 const X86Subtarget &Subtarget) {
9893 bool IsLeftZeroSide = true;
9894 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9897 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9898 MVT IntegerType =
9899 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9900 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9901 unsigned NumElts = VT.getVectorNumElements();
9902 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9903 "Unexpected number of vector elements");
9904 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9905 Subtarget, DAG, DL);
9906 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9907 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9908 return DAG.getSelect(DL, VT, VMask,
9909 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
9913 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9914 unsigned &UnpackOpcode, bool IsUnary,
9915 ArrayRef<int> TargetMask,
9916 const SDLoc &DL, SelectionDAG &DAG,
9917 const X86Subtarget &Subtarget) {
9918 int NumElts = VT.getVectorNumElements();
9920 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9921 for (int i = 0; i != NumElts; i += 2) {
9922 int M1 = TargetMask[i + 0];
9923 int M2 = TargetMask[i + 1];
9924 Undef1 &= (SM_SentinelUndef == M1);
9925 Undef2 &= (SM_SentinelUndef == M2);
9926 Zero1 &= isUndefOrZero(M1);
9927 Zero2 &= isUndefOrZero(M2);
9929 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9930 "Zeroable shuffle detected");
9932 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9933 SmallVector<int, 64> Unpckl, Unpckh;
9934 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9935 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9936 UnpackOpcode = X86ISD::UNPCKL;
9937 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9938 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9942 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9943 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9944 UnpackOpcode = X86ISD::UNPCKH;
9945 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9946 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9950 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
9951 if (IsUnary && (Zero1 || Zero2)) {
9952 // Don't bother if we can blend instead.
9953 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9954 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9957 bool MatchLo = true, MatchHi = true;
9958 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9959 int M = TargetMask[i];
9961 // Ignore if the input is known to be zero or the index is undef.
9962 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9963 (M == SM_SentinelUndef))
9966 MatchLo &= (M == Unpckl[i]);
9967 MatchHi &= (M == Unpckh[i]);
9970 if (MatchLo || MatchHi) {
9971 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9972 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9973 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9978 // If a binary shuffle, commute and try again.
9980 ShuffleVectorSDNode::commuteMask(Unpckl);
9981 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9982 UnpackOpcode = X86ISD::UNPCKL;
9987 ShuffleVectorSDNode::commuteMask(Unpckh);
9988 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9989 UnpackOpcode = X86ISD::UNPCKH;
9998 // X86 has dedicated unpack instructions that can handle specific blend
9999 // operations: UNPCKH and UNPCKL.
10000 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
10001 ArrayRef<int> Mask, SDValue V1,
10002 SDValue V2, SelectionDAG &DAG) {
10003 SmallVector<int, 8> Unpckl;
10004 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10005 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
10006 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10008 SmallVector<int, 8> Unpckh;
10009 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10010 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
10011 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10013 // Commute and try again.
10014 ShuffleVectorSDNode::commuteMask(Unpckl);
10015 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
10016 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10018 ShuffleVectorSDNode::commuteMask(Unpckh);
10019 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
10020 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
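// For reference, for v4f32 the binary unpack masks are <0, 4, 1, 5> (UNPCKL)
// and <2, 6, 3, 7> (UNPCKH): the low (resp. high) halves of the two inputs
// interleaved element by element.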
10025 static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
10026 int Delta) {
10027 int Size = (int)Mask.size();
10028 int Split = Size / Delta;
10029 int TruncatedVectorStart = SwappedOps ? Size : 0;
10031 // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
10032 if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
10035 // The rest of the mask should not refer to the truncated vector's elements.
10036 if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
10037 TruncatedVectorStart + Size))
10043 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10045 // An example is the following:
10047 // t0: ch = EntryToken
10048 // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10049 // t25: v4i32 = truncate t2
10050 // t41: v8i16 = bitcast t25
10051 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10052 // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10053 // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10054 // t18: v2i64 = bitcast t51
10056 // Without avx512vl, this is lowered to:
10058 // vpmovqd %zmm0, %ymm0
10059 // vpshufb {{.*#+}} xmm0 =
10060 // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
10062 // But when avx512vl is available, one can just use a single vpmovdw
10063 // instruction.
10064 static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
10065 MVT VT, SDValue V1, SDValue V2,
10067 const X86Subtarget &Subtarget) {
10068 if (VT != MVT::v16i8 && VT != MVT::v8i16)
10071 if (Mask.size() != VT.getVectorNumElements())
10074 bool SwappedOps = false;
10076 if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
10077 if (!ISD::isBuildVectorAllZeros(V1.getNode()))
10086 // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
10087 // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
10089 // and similar ones.
10090 if (V1.getOpcode() != ISD::BITCAST)
10092 if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
10095 SDValue Src = V1.getOperand(0).getOperand(0);
10096 MVT SrcVT = Src.getSimpleValueType();
10098 // The vptrunc** instructions truncating 128-bit and 256-bit vectors
10099 // are only available with avx512vl.
10100 if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
10103 // Down Convert Word to Byte is only available with avx512bw. The case with
10104 // 256-bit output doesn't contain a shuffle and is therefore not handled here.
10105 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
10106 !Subtarget.hasBWI())
10109 // The first half/quarter of the mask should refer to every second/fourth
10110 // element of the truncated and bitcasted vector.
10111 if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
10112 !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
10115 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
10118 // X86 has dedicated pack instructions that can handle specific truncation
10119 // operations: PACKSS and PACKUS.
10120 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
10121 SDValue &V2, unsigned &PackOpcode,
10122 ArrayRef<int> TargetMask,
10124 const X86Subtarget &Subtarget) {
10125 unsigned NumElts = VT.getVectorNumElements();
10126 unsigned BitSize = VT.getScalarSizeInBits();
10127 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
10128 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
10130 auto MatchPACK = [&](SDValue N1, SDValue N2) {
10131 SDValue VV1 = DAG.getBitcast(PackVT, N1);
10132 SDValue VV2 = DAG.getBitcast(PackVT, N2);
10133 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
10134 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
10135 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
10136 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
10140 PackOpcode = X86ISD::PACKUS;
10144 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
10145 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
10149 PackOpcode = X86ISD::PACKSS;
10155 // Try binary shuffle.
10156 SmallVector<int, 32> BinaryMask;
10157 createPackShuffleMask(VT, BinaryMask, false);
10158 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
10159 if (MatchPACK(V1, V2))
10162 // Try unary shuffle.
10163 SmallVector<int, 32> UnaryMask;
10164 createPackShuffleMask(VT, UnaryMask, true);
10165 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
10166 if (MatchPACK(V1, V1))
10172 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
10173 ArrayRef<int> Mask, SDValue V1,
10174 SDValue V2, SelectionDAG &DAG,
10175 const X86Subtarget &Subtarget) {
10176 MVT PackVT;
10177 unsigned PackOpcode;
10178 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10180 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
10181 DAG.getBitcast(PackVT, V2));
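// For example, a v16i8 shuffle with the binary pack mask
// <0, 2, 4, ..., 14, 16, 18, ..., 30> of two inputs whose odd bytes are known
// zero can become a PACKUS of the inputs viewed as v8i16.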
10186 /// Try to emit a bitmask instruction for a shuffle.
10188 /// This handles cases where we can model a blend exactly as a bitmask due to
10189 /// one of the inputs being zeroable.
10190 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10191 SDValue V2, ArrayRef<int> Mask,
10192 const APInt &Zeroable,
10193 SelectionDAG &DAG) {
10194 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
10195 MVT EltVT = VT.getVectorElementType();
10196 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10197 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10198 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10200 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10203 if (Mask[i] % Size != i)
10204 return SDValue(); // Not a blend.
10206 V = Mask[i] < Size ? V1 : V2;
10207 else if (V != (Mask[i] < Size ? V1 : V2))
10208 return SDValue(); // Can only let one input through the mask.
10210 VMaskOps[i] = AllOnes;
10213 return SDValue(); // No non-zeroable elements!
10215 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
10216 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
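// For example, a v4i32 shuffle mask <0, -1, 2, -1> where elements 1 and 3 are
// zeroable becomes AND(V1, <-1, 0, -1, 0>).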
10219 /// Try to emit a blend instruction for a shuffle using bit math.
10221 /// This is used as a fallback approach when first class blend instructions are
10222 /// unavailable. Currently it is only suitable for integer vectors, but could
10223 /// be generalized for floating point vectors if desirable.
10224 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10225 SDValue V2, ArrayRef<int> Mask,
10226 SelectionDAG &DAG) {
10227 assert(VT.isInteger() && "Only supports integer vector types!");
10228 MVT EltVT = VT.getVectorElementType();
10229 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10230 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10231 SmallVector<SDValue, 16> MaskOps;
10232 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10233 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10234 return SDValue(); // Shuffled input!
10235 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10238 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10239 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
10240 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
10241 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
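// For example, the v4i32 blend mask <0, 5, 2, 7> produces
// (V1 & <-1, 0, -1, 0>) | (~<-1, 0, -1, 0> & V2) via the AND, ANDNP and OR above.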
10244 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10245 SDValue PreservedSrc,
10246 const X86Subtarget &Subtarget,
10247 SelectionDAG &DAG);
10249 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
10250 MutableArrayRef<int> TargetMask,
10251 bool &ForceV1Zero, bool &ForceV2Zero,
10252 uint64_t &BlendMask) {
10253 bool V1IsZeroOrUndef =
10254 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10255 bool V2IsZeroOrUndef =
10256 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10259 ForceV1Zero = false, ForceV2Zero = false;
10260 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
10262 // Attempt to generate the binary blend mask. If an input is zero then
10263 // we can use any lane.
10264 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
10265 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
10266 int M = TargetMask[i];
10267 if (M == SM_SentinelUndef)
10271 if (M == i + Size) {
10272 BlendMask |= 1ull << i;
10275 if (M == SM_SentinelZero) {
10276 if (V1IsZeroOrUndef) {
10277 ForceV1Zero = true;
10281 if (V2IsZeroOrUndef) {
10282 ForceV2Zero = true;
10283 BlendMask |= 1ull << i;
10284 TargetMask[i] = i + Size;
10293 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
10295 uint64_t ScaledMask = 0;
10296 for (int i = 0; i != Size; ++i)
10297 if (BlendMask & (1ull << i))
10298 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
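// For example, scaling the 4-bit blend mask 0b0101 by 2 gives 0b00110011:
// each selected element expands to Scale consecutive mask bits.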
10302 /// Try to emit a blend instruction for a shuffle.
10304 /// This doesn't do any checks for the availability of instructions for blending
10305 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10306 /// be matched in the backend with the type given. What it does check for is
10307 /// that the shuffle mask is a blend, or convertible into a blend with zero.
10308 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10309 SDValue V2, ArrayRef<int> Original,
10310 const APInt &Zeroable,
10311 const X86Subtarget &Subtarget,
10312 SelectionDAG &DAG) {
10313 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
10315 uint64_t BlendMask = 0;
10316 bool ForceV1Zero = false, ForceV2Zero = false;
10317 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
10321 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10323 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10325 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10327 switch (VT.SimpleTy) {
10332 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10333 DAG.getConstant(BlendMask, DL, MVT::i8));
10336 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10340 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
10341 // that instruction.
10342 if (Subtarget.hasAVX2()) {
10343 // Scale the blend by the number of 32-bit dwords per element.
10344 int Scale = VT.getScalarSizeInBits() / 32;
10345 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
10346 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
10347 V1 = DAG.getBitcast(BlendVT, V1);
10348 V2 = DAG.getBitcast(BlendVT, V2);
10349 return DAG.getBitcast(
10350 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
10351 DAG.getConstant(BlendMask, DL, MVT::i8)));
10355 // For integer shuffles we need to expand the mask and cast the inputs to
10356 // v8i16s prior to blending.
10357 int Scale = 8 / VT.getVectorNumElements();
10358 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
10359 V1 = DAG.getBitcast(MVT::v8i16, V1);
10360 V2 = DAG.getBitcast(MVT::v8i16, V2);
10361 return DAG.getBitcast(VT,
10362 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
10363 DAG.getConstant(BlendMask, DL, MVT::i8)));
10365 case MVT::v16i16: {
10366 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10367 SmallVector<int, 8> RepeatedMask;
10368 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10369 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10370 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10372 for (int i = 0; i < 8; ++i)
10373 if (RepeatedMask[i] >= 8)
10374 BlendMask |= 1ull << i;
10375 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10376 DAG.getConstant(BlendMask, DL, MVT::i8));
10378 // Use PBLENDW for lower/upper lanes and then blend lanes.
10379 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10380 // merge to VSELECT where useful.
10381 uint64_t LoMask = BlendMask & 0xFF;
10382 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10383 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10384 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10385 DAG.getConstant(LoMask, DL, MVT::i8));
10386 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10387 DAG.getConstant(HiMask, DL, MVT::i8));
10388 return DAG.getVectorShuffle(
10389 MVT::v16i16, DL, Lo, Hi,
10390 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
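// The final lane-blend mask takes elements 0-7 (the low 128-bit lane) from Lo
// and elements 24-31 (the high 128-bit lane of the second operand) from Hi.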
10396 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
10397 "256-bit byte-blends require AVX2 support!");
10399 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10400 if (SDValue Masked =
10401 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
10404 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10405 MVT IntegerType =
10406 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10407 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10408 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10411 // Scale the blend by the number of bytes per element.
10412 int Scale = VT.getScalarSizeInBits() / 8;
10414 // This form of blend is always done on bytes. Compute the byte vector
10415 // type.
10416 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10418 // x86 allows load folding with blendvb from the 2nd source operand. But
10419 // we are still using LLVM select here (see comment below), so that's V1.
10420 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10421 // allow that load-folding possibility.
10422 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10423 ShuffleVectorSDNode::commuteMask(Mask);
10427 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10428 // mix of LLVM's code generator and the x86 backend. We tell the code
10429 // generator that boolean values in the elements of an x86 vector register
10430 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10431 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10432 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10433 // of the element (the remaining are ignored) and 0 in that high bit would
10434 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10435 // the LLVM model for boolean values in vector elements gets the relevant
10436 // bit set, it is set backwards and over constrained relative to x86's
10437 // actual model.
10438 SmallVector<SDValue, 32> VSELECTMask;
10439 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10440 for (int j = 0; j < Scale; ++j)
10441 VSELECTMask.push_back(
10442 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10443 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10446 V1 = DAG.getBitcast(BlendVT, V1);
10447 V2 = DAG.getBitcast(BlendVT, V2);
10448 return DAG.getBitcast(
10450 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10460 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10461 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10462 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10465 llvm_unreachable("Not a supported integer vector type!");
10469 /// Try to lower as a blend of elements from two inputs followed by
10470 /// a single-input permutation.
10472 /// This matches the pattern where we can blend elements from two inputs and
10473 /// then reduce the shuffle to a single-input permutation.
10474 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10475 SDValue V1, SDValue V2,
10476 ArrayRef<int> Mask,
10478 bool ImmBlends = false) {
10479 // We build up the blend mask while checking whether a blend is a viable way
10480 // to reduce the shuffle.
10481 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10482 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
10484 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10488 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10490 if (BlendMask[Mask[i] % Size] < 0)
10491 BlendMask[Mask[i] % Size] = Mask[i];
10492 else if (BlendMask[Mask[i] % Size] != Mask[i])
10493 return SDValue(); // Can't blend in the needed input!
10495 PermuteMask[i] = Mask[i] % Size;
10498 // If only immediate blends, then bail if the blend mask can't be widened to
10499 // i16 blends.
10500 unsigned EltSize = VT.getScalarSizeInBits();
10501 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10504 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10505 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
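// For example, the v4i32 shuffle <1, 4, 3, 6> is lowered as a blend with mask
// <4, 1, 6, 3> followed by the single-input permute <1, 0, 3, 2>.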
10508 /// Try to lower as an unpack of elements from two inputs followed by
10509 /// a single-input permutation.
10511 /// This matches the pattern where we can unpack elements from two inputs and
10512 /// then reduce the shuffle to a single-input (wider) permutation.
10513 static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10514 SDValue V1, SDValue V2,
10515 ArrayRef<int> Mask,
10516 SelectionDAG &DAG) {
10517 int NumElts = Mask.size();
10518 int NumLanes = VT.getSizeInBits() / 128;
10519 int NumLaneElts = NumElts / NumLanes;
10520 int NumHalfLaneElts = NumLaneElts / 2;
10522 bool MatchLo = true, MatchHi = true;
10523 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10525 // Determine UNPCKL/UNPCKH type and operand order.
10526 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10527 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
10528 int M = Mask[Lane + Elt];
10532 SDValue &Op = Ops[Elt & 1];
10533 if (M < NumElts && (Op.isUndef() || Op == V1))
10535 else if (NumElts <= M && (Op.isUndef() || Op == V2))
10540 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10541 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
10542 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
10543 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
10544 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
10545 if (!MatchLo && !MatchHi)
10549 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10551 // Now check that each pair of elts comes from the same unpack pair
10552 // and set the permute mask based on each pair.
10553 // TODO - Investigate cases where we permute individual elements.
10554 SmallVector<int, 32> PermuteMask(NumElts, -1);
10555 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10556 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
10557 int M0 = Mask[Lane + Elt + 0];
10558 int M1 = Mask[Lane + Elt + 1];
10559 if (0 <= M0 && 0 <= M1 &&
10560 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
10563 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
10565 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
10569 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10570 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10571 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10574 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
10575 /// permuting the elements of the result in place.
10576 static SDValue lowerVectorShuffleAsByteRotateAndPermute(
10577 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10578 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10579 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
10580 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
10581 (VT.is512BitVector() && !Subtarget.hasBWI()))
10584 // We don't currently support lane crossing permutes.
10585 if (is128BitLaneCrossingShuffleMask(VT, Mask))
10588 int Scale = VT.getScalarSizeInBits() / 8;
10589 int NumLanes = VT.getSizeInBits() / 128;
10590 int NumElts = VT.getVectorNumElements();
10591 int NumEltsPerLane = NumElts / NumLanes;
10593 // Determine range of mask elts.
10594 bool Blend1 = true;
10595 bool Blend2 = true;
10596 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
10597 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
10598 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
10599 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
10600 int M = Mask[Lane + Elt];
10604 Blend1 &= (M == (Lane + Elt));
10605 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
10606 M = M % NumEltsPerLane;
10607 Range1.first = std::min(Range1.first, M);
10608 Range1.second = std::max(Range1.second, M);
10611 Blend2 &= (M == (Lane + Elt));
10612 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
10613 M = M % NumEltsPerLane;
10614 Range2.first = std::min(Range2.first, M);
10615 Range2.second = std::max(Range2.second, M);
10620 // Bail if we don't need both elements.
10621 // TODO - it might be worth doing this for unary shuffles if the permute
10623 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
10624 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
10627 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
10630 // Rotate the 2 ops so we can access both ranges, then permute the result.
10631 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
10632 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10633 SDValue Rotate = DAG.getBitcast(
10634 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
10635 DAG.getBitcast(ByteVT, Lo),
10636 DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));
10637 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
10638 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
10639 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
10640 int M = Mask[Lane + Elt];
10644 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
10646 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
10649 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
10652 // Check if the ranges are small enough to rotate from either direction.
10653 if (Range2.second < Range1.first)
10654 return RotateAndPermute(V1, V2, Range1.first, 0);
10655 if (Range1.second < Range2.first)
10656 return RotateAndPermute(V2, V1, Range2.first, NumElts);
10660 /// Generic routine to decompose a shuffle and blend into independent
10661 /// blends and permutes.
10663 /// This matches the extremely common pattern for handling combined
10664 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
10665 /// operations. It will try to pick the best arrangement of shuffles and
10667 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
10668 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10669 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10670 // Shuffle the input elements into the desired positions in V1 and V2 and
10671 // blend them together.
10672 SmallVector<int, 32> V1Mask(Mask.size(), -1);
10673 SmallVector<int, 32> V2Mask(Mask.size(), -1);
10674 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10675 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10676 if (Mask[i] >= 0 && Mask[i] < Size) {
10677 V1Mask[i] = Mask[i];
10679 } else if (Mask[i] >= Size) {
10680 V2Mask[i] = Mask[i] - Size;
10681 BlendMask[i] = i + Size;
10684 // Try the simpler initial blend/unpack/rotate strategies unless one of the
10685 // input shuffles would be a no-op. We prefer to shuffle the inputs first,
10686 // since that shuffle may fold with a load or provide some other benefit.
10687 // However, when we would have to do twice as many shuffles to achieve this,
10688 // doing a single 2-input pre-shuffle (blend/unpack/rotate) first is the better strategy.
10689 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
10690 // Only prefer immediate blends to unpack/rotate.
10691 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10692 DL, VT, V1, V2, Mask, DAG, true))
10694 if (SDValue UnpackPerm =
10695 lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
10697 if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute(
10698 DL, VT, V1, V2, Mask, Subtarget, DAG))
10700 // Unpack/rotate failed - try again with variable blends.
10701 if (SDValue BlendPerm =
10702 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
10706 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10707 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10708 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10711 /// Try to lower a vector shuffle as a rotation.
10713 /// This is used to support PALIGNR for SSSE3 and VALIGND/Q for AVX512.
10714 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
10715 ArrayRef<int> Mask) {
10716 int NumElts = Mask.size();
10718 // We need to detect various ways of spelling a rotation:
10719 // [11, 12, 13, 14, 15, 0, 1, 2]
10720 // [-1, 12, 13, 14, -1, -1, 1, -1]
10721 // [-1, -1, -1, -1, -1, -1, 1, 2]
10722 // [ 3, 4, 5, 6, 7, 8, 9, 10]
10723 // [-1, 4, 5, 6, -1, -1, 9, -1]
10724 // [-1, 4, 5, 6, -1, -1, -1, -1]
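// All of the example masks above describe the same rotation amount (3 for 8
// elements); the undef entries merely relax which elements must match and
// which input supplies them.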
10727 for (int i = 0; i < NumElts; ++i) {
10729 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
10730 "Unexpected mask index.");
10734 // Determine where a rotated vector would have started.
10735 int StartIdx = i - (M % NumElts);
10737 // The identity rotation isn't interesting, stop.
10740 // If we found the tail of a vector the rotation must be the missing
10741 // front. If we found the head of a vector, the rotation must be how many
10742 // elements of the head we found.
10743 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
10746 Rotation = CandidateRotation;
10747 else if (Rotation != CandidateRotation)
10748 // The rotations don't match, so we can't match this mask.
10751 // Compute which value this mask is pointing at.
10752 SDValue MaskV = M < NumElts ? V1 : V2;
10754 // Compute which of the two target values this index should be assigned
10755 // to. This reflects whether the high elements are remaining or the low
10756 // elements are remaining.
10757 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
10759 // Either set up this value if we've not encountered it before, or check
10760 // that it remains consistent.
10763 else if (TargetV != MaskV)
10764 // This may be a rotation, but it pulls from the inputs in some
10765 // unsupported interleaving.
10769 // Check that we successfully analyzed the mask, and normalize the results.
10770 assert(Rotation != 0 && "Failed to locate a viable rotation!");
10771 assert((Lo || Hi) && "Failed to find a rotated input vector!");
10783 /// Try to lower a vector shuffle as a byte rotation.
10785 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
10786 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
10787 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
10788 /// try to generically lower a vector shuffle through such a pattern. It
10789 /// does not check for the profitability of lowering either as PALIGNR or
10790 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
10791 /// This matches shuffle vectors that look like:
10793 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
10795 /// Essentially it concatenates V1 and V2, shifts right by some number of
10796 /// elements, and takes the low elements as the result. Note that while this is
10797 /// specified as a *right shift* because x86 is little-endian, it is a *left
10798 /// rotate* of the vector lanes.
10799 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
10800 ArrayRef<int> Mask) {
10801 // Don't accept any shuffles with zero elements.
10802 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
10805 // PALIGNR works on 128-bit lanes.
10806 SmallVector<int, 16> RepeatedMask;
10807 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
10810 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
10814 // PALIGNR rotates bytes, so we need to scale the
10815 // rotation based on how many bytes are in the vector lane.
10816 int NumElts = RepeatedMask.size();
10817 int Scale = 16 / NumElts;
10818 return Rotation * Scale;
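// For example, a v8i16 rotation of 3 elements within a 128-bit lane becomes a
// byte rotation of 6 (Scale = 16 / 8 = 2 bytes per element).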
10821 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
10822 SDValue V1, SDValue V2,
10823 ArrayRef<int> Mask,
10824 const X86Subtarget &Subtarget,
10825 SelectionDAG &DAG) {
10826 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
10828 SDValue Lo = V1, Hi = V2;
10829 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
10830 if (ByteRotation <= 0)
10833 // Cast the inputs to i8 vector of correct length to match PALIGNR or
10834 // PSLLDQ/PSRLDQ.
10835 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10836 Lo = DAG.getBitcast(ByteVT, Lo);
10837 Hi = DAG.getBitcast(ByteVT, Hi);
10839 // SSSE3 targets can use the palignr instruction.
10840 if (Subtarget.hasSSSE3()) {
10841 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
10842 "512-bit PALIGNR requires BWI instructions");
10843 return DAG.getBitcast(
10844 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
10845 DAG.getConstant(ByteRotation, DL, MVT::i8)));
10848 assert(VT.is128BitVector() &&
10849 "Rotate-based lowering only supports 128-bit lowering!");
10850 assert(Mask.size() <= 16 &&
10851 "Can shuffle at most 16 bytes in a 128-bit vector!");
10852 assert(ByteVT == MVT::v16i8 &&
10853 "SSE2 rotate lowering only needed for v16i8!");
10855 // Default SSE2 implementation
10856 int LoByteShift = 16 - ByteRotation;
10857 int HiByteShift = ByteRotation;
10859 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
10860 DAG.getConstant(LoByteShift, DL, MVT::i8));
10861 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
10862 DAG.getConstant(HiByteShift, DL, MVT::i8));
10863 return DAG.getBitcast(VT,
10864 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
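// i.e. the result is (Lo << (16 - ByteRotation)) | (Hi >> ByteRotation) in
// byte terms; for a byte rotation of 6 this is a 10-byte PSLLDQ of Lo OR'd
// with a 6-byte PSRLDQ of Hi.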
10867 /// Try to lower a vector shuffle as a dword/qword rotation.
10869 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
10870 /// rotation of the concatenation of two vectors; this routine will
10871 /// try to generically lower a vector shuffle through such a pattern.
10873 /// Essentially it concatenates V1 and V2, shifts right by some number of
10874 /// elements, and takes the low elements as the result. Note that while this is
10875 /// specified as a *right shift* because x86 is little-endian, it is a *left
10876 /// rotate* of the vector lanes.
10877 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
10878 SDValue V1, SDValue V2,
10879 ArrayRef<int> Mask,
10880 const X86Subtarget &Subtarget,
10881 SelectionDAG &DAG) {
10882 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
10883 "Only 32-bit and 64-bit elements are supported!");
10885 // 128/256-bit vectors are only supported with VLX.
10886 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
10887 && "VLX required for 128/256-bit vectors");
10889 SDValue Lo = V1, Hi = V2;
10890 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
10894 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
10895 DAG.getConstant(Rotation, DL, MVT::i8));
10898 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
10900 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
10901 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
10902 /// matches elements from one of the input vectors shuffled to the left or
10903 /// right with zeroable elements 'shifted in'. It handles both the strictly
10904 /// bit-wise element shifts and the byte shift across an entire 128-bit double
10905 /// quad word lane.
10907 /// PSLL : (little-endian) left bit shift.
10908 /// [ zz, 0, zz, 2 ]
10909 /// [ -1, 4, zz, -1 ]
10910 /// PSRL : (little-endian) right bit shift.
10911 /// [ 1, zz, 3, zz]
10912 /// [ -1, -1, 7, zz]
10913 /// PSLLDQ : (little-endian) left byte shift
10914 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
10915 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
10916 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
10917 /// PSRLDQ : (little-endian) right byte shift
10918 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
10919 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
10920 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
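/// For example, the v4i32 mask [ zz, 0, zz, 2 ] above matches a VSHLI of the
/// vector viewed as v2i64 by 32 bits (each i64 element shifted left by one
/// i32 element, shifting in zeros).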
10921 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
10922 unsigned ScalarSizeInBits,
10923 ArrayRef<int> Mask, int MaskOffset,
10924 const APInt &Zeroable,
10925 const X86Subtarget &Subtarget) {
10926 int Size = Mask.size();
10927 unsigned SizeInBits = Size * ScalarSizeInBits;
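// CheckZeros verifies that, within each group of Scale elements, the Shift
// positions that get shifted in are zeroable: the low end of the group for a
// left shift, the high end for a right shift.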
10929 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
10930 for (int i = 0; i < Size; i += Scale)
10931 for (int j = 0; j < Shift; ++j)
10932 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
10938 auto MatchShift = [&](int Shift, int Scale, bool Left) {
10939 for (int i = 0; i != Size; i += Scale) {
10940 unsigned Pos = Left ? i + Shift : i;
10941 unsigned Low = Left ? i : i + Shift;
10942 unsigned Len = Scale - Shift;
10943 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
10947 int ShiftEltBits = ScalarSizeInBits * Scale;
10948 bool ByteShift = ShiftEltBits > 64;
10949 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
10950 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
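// Bit shifts (VSHLI/VSRLI) take their amount in bits, while the whole-lane
// byte shifts (VSHLDQ/VSRLDQ) take it in bytes.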
10951 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
10953 // Normalize the scale for byte shifts to still produce an i64 element
10954 // type.
10955 Scale = ByteShift ? Scale / 2 : Scale;
10957 // We need to round trip through the appropriate type for the shift.
10958 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
10959 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
10960 : MVT::getVectorVT(ShiftSVT, Size / Scale);
10961 return (int)ShiftAmt;
10964 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
10965 // keep doubling the size of the integer elements up to that. We can
10966 // then shift the elements of the integer vector by whole multiples of
10967 // their width within the elements of the larger integer vector. Test each
10968 // multiple to see if we can find a match with the moved element indices
10969 // and that the shifted in elements are all zeroable.
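// For example, the v8i16 mask <zz,0, zz,2, zz,4, zz,6> matches Scale == 2 with
// a left shift by one element, and is lowered as a VSHLI of v4i32 by 16 bits.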
10970 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
10971 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
10972 for (int Shift = 1; Shift != Scale; ++Shift)
10973 for (bool Left : {true, false})
10974 if (CheckZeros(Shift, Scale, Left)) {
10975 int ShiftAmt = MatchShift(Shift, Scale, Left);
10984 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
10985 SDValue V2, ArrayRef<int> Mask,
10986 const APInt &Zeroable,
10987 const X86Subtarget &Subtarget,
10988 SelectionDAG &DAG) {
10989 int Size = Mask.size();
10990 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10996 // Try to match shuffle against V1 shift.
10997 int ShiftAmt = matchVectorShuffleAsShift(
10998 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
11000 // If V1 failed, try to match shuffle against V2 shift.
11001 if (ShiftAmt < 0) {
11003 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11004 Mask, Size, Zeroable, Subtarget);
11011 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11012 "Illegal integer vector type");
11013 V = DAG.getBitcast(ShiftVT, V);
11014 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11015 DAG.getConstant(ShiftAmt, DL, MVT::i8));
11016 return DAG.getBitcast(VT, V);
11019 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11020 // Remainder of lower half result is zero and upper half is all undef.
11021 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11022 ArrayRef<int> Mask, uint64_t &BitLen,
11023 uint64_t &BitIdx, const APInt &Zeroable) {
11024 int Size = Mask.size();
11025 int HalfSize = Size / 2;
11026 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11027 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
11029 // Upper half must be undefined.
11030 if (!isUndefInRange(Mask, HalfSize, HalfSize))
11033 // Determine the extraction length from the part of the
11034 // lower half that isn't zeroable.
11035 int Len = HalfSize;
11036 for (; Len > 0; --Len)
11037 if (!Zeroable[Len - 1])
11039 assert(Len > 0 && "Zeroable shuffle mask");
11041 // Attempt to match first Len sequential elements from the lower half.
11044 for (int i = 0; i != Len; ++i) {
11046 if (M == SM_SentinelUndef)
11048 SDValue &V = (M < Size ? V1 : V2);
11051 // The extracted elements must start at a valid index and all mask
11052 // elements must be in the lower half.
11053 if (i > M || M >= HalfSize)
11056 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11064 if (!Src || Idx < 0)
11067 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
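// The EXTRQ immediate takes the length and index as bit counts in 6-bit
// fields, hence the conversion to bits and the masking with 0x3f below.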
11068 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11069 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11074 // INSERTQ: Extract lowest Len elements from lower half of second source and
11075 // insert over first source, starting at Idx.
11076 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11077 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11078 ArrayRef<int> Mask, uint64_t &BitLen,
11079 uint64_t &BitIdx) {
11080 int Size = Mask.size();
11081 int HalfSize = Size / 2;
11082 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11084 // Upper half must be undefined.
11085 if (!isUndefInRange(Mask, HalfSize, HalfSize))
11088 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11091 // Attempt to match first source from mask before insertion point.
11092 if (isUndefInRange(Mask, 0, Idx)) {
11094 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11096 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11102 // Extend the extraction length looking to match both the insertion of
11103 // the second source and the remaining elements of the first.
11104 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11106 int Len = Hi - Idx;
11108 // Match insertion.
11109 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11111 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11117 // Match the remaining elements of the lower half.
11118 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11120 } else if ((!Base || (Base == V1)) &&
11121 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11123 } else if ((!Base || (Base == V2)) &&
11124 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11131 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11132 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11142 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11143 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11144 SDValue V2, ArrayRef<int> Mask,
11145 const APInt &Zeroable,
11146 SelectionDAG &DAG) {
11147 uint64_t BitLen, BitIdx;
11148 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11149 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11150 DAG.getConstant(BitLen, DL, MVT::i8),
11151 DAG.getConstant(BitIdx, DL, MVT::i8));
11153 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11154 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11155 V2 ? V2 : DAG.getUNDEF(VT),
11156 DAG.getConstant(BitLen, DL, MVT::i8),
11157 DAG.getConstant(BitIdx, DL, MVT::i8));
11162 /// Lower a vector shuffle as a zero or any extension.
11164 /// Given a specific number of elements, element bit width, and extension
11165 /// stride, produce either a zero or any extension based on the available
11166 /// features of the subtarget. The extended elements are consecutive and
11167 /// may start from an offset element index in the input; to avoid excess
11168 /// shuffling the offset must either be in the bottom lane or at the start
11169 /// of a higher lane. All extended elements must come from a single input
11170 /// lane.
11171 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
11172 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11173 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11174 assert(Scale > 1 && "Need a scale to extend.");
11175 int EltBits = VT.getScalarSizeInBits();
11176 int NumElements = VT.getVectorNumElements();
11177 int NumEltsPerLane = 128 / EltBits;
11178 int OffsetLane = Offset / NumEltsPerLane;
11179 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11180 "Only 8, 16, and 32 bit elements can be extended.");
11181 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11182 assert(0 <= Offset && "Extension offset must be positive.");
11183 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11184 "Extension offset must be in the first lane or start an upper lane.");
11186 // Check that an index is in the same lane as the base offset.
11187 auto SafeOffset = [&](int Idx) {
11188 return OffsetLane == (Idx / NumEltsPerLane);
11191 // Shift along an input so that the offset base moves to the first element.
11192 auto ShuffleOffset = [&](SDValue V) {
11196 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11197 for (int i = 0; i * Scale < NumElements; ++i) {
11198 int SrcIdx = i + Offset;
11199 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11201 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11204 // Found a valid zext mask! Try various lowering strategies based on the
11205 // input type and available ISA extensions.
11206 if (Subtarget.hasSSE41()) {
11207 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11208 // PUNPCK will catch this in a later shuffle match.
11209 if (Offset && Scale == 2 && VT.is128BitVector())
11211 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11212 NumElements / Scale);
11213 InputV = ShuffleOffset(InputV);
11214 InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG);
11215 return DAG.getBitcast(VT, InputV);
11218 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11220 // For any extends we can cheat for larger element sizes and use shuffle
11221 // instructions that can fold with a load and/or copy.
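// For example, with Offset == 0 an any-extend of 32-bit elements to 64 bits is
// just a PSHUFD with the mask <0,u,1,u>: each 64-bit result lane keeps a source
// dword in its low half and leaves the high half undefined.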
11222 if (AnyExt && EltBits == 32) {
11223 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11225 return DAG.getBitcast(
11226 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11227 DAG.getBitcast(MVT::v4i32, InputV),
11228 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11230 if (AnyExt && EltBits == 16 && Scale > 2) {
11231 int PSHUFDMask[4] = {Offset / 2, -1,
11232 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11233 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11234 DAG.getBitcast(MVT::v4i32, InputV),
11235 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11236 int PSHUFWMask[4] = {1, -1, -1, -1};
11237 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
11238 return DAG.getBitcast(
11239 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11240 DAG.getBitcast(MVT::v8i16, InputV),
11241 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11244 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11246 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11247 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11248 assert(VT.is128BitVector() && "Unexpected vector width!");
11250 int LoIdx = Offset * EltBits;
11251 SDValue Lo = DAG.getBitcast(
11252 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11253 DAG.getConstant(EltBits, DL, MVT::i8),
11254 DAG.getConstant(LoIdx, DL, MVT::i8)));
11256 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
11257 !SafeOffset(Offset + 1))
11258 return DAG.getBitcast(VT, Lo);
11260 int HiIdx = (Offset + 1) * EltBits;
11261 SDValue Hi = DAG.getBitcast(
11262 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11263 DAG.getConstant(EltBits, DL, MVT::i8),
11264 DAG.getConstant(HiIdx, DL, MVT::i8)));
11265 return DAG.getBitcast(VT,
11266 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
11269 // If this would require more than 2 unpack instructions to expand, use
11270 // pshufb when available. We can only use more than 2 unpack instructions
11271 // when zero extending i8 elements which also makes it easier to use pshufb.
11272 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
11273 assert(NumElements == 16 && "Unexpected byte vector width!");
11274 SDValue PSHUFBMask[16];
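// A PSHUFB selector byte with its high bit set (0x80 here) writes a zero,
// which provides the zero-extended parts of each element.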
11275 for (int i = 0; i < 16; ++i) {
11276 int Idx = Offset + (i / Scale);
11277 PSHUFBMask[i] = DAG.getConstant(
11278 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
11280 InputV = DAG.getBitcast(MVT::v16i8, InputV);
11281 return DAG.getBitcast(
11282 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
11283 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
11286 // If we are extending from an offset, ensure we start on a boundary that
11287 // we can unpack from.
11288 int AlignToUnpack = Offset % (NumElements / Scale);
11289 if (AlignToUnpack) {
11290 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11291 for (int i = AlignToUnpack; i < NumElements; ++i)
11292 ShMask[i - AlignToUnpack] = i;
11293 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
11294 Offset -= AlignToUnpack;
11297 // Otherwise emit a sequence of unpacks.
11299 unsigned UnpackLoHi = X86ISD::UNPCKL;
11300 if (Offset >= (NumElements / 2)) {
11301 UnpackLoHi = X86ISD::UNPCKH;
11302 Offset -= (NumElements / 2);
11305 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
11306 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
11307 : getZeroVector(InputVT, Subtarget, DAG, DL);
11308 InputV = DAG.getBitcast(InputVT, InputV);
11309 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
11310 Scale /= 2;
11311 EltBits *= 2;
11312 NumElements /= 2;
11313 } while (Scale > 1);
11314 return DAG.getBitcast(VT, InputV);
11317 /// Try to lower a vector shuffle as a zero extension on any microarch.
11319 /// This routine will try to do everything in its power to cleverly lower
11320 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
11321 /// check for the profitability of this lowering; it tries to aggressively
11322 /// match this pattern. It will use all of the micro-architectural details it
11323 /// can to emit an efficient lowering. It handles both blends with all-zero
11324 /// inputs (to explicitly zero-extend) and undef lanes (sometimes undef due to
11325 /// being masked out later).
11327 /// The reason we have dedicated lowering for zext-style shuffles is that they
11328 /// are both incredibly common and often quite performance sensitive.
11329 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
11330 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11331 const APInt &Zeroable, const X86Subtarget &Subtarget,
11332 SelectionDAG &DAG) {
11333 int Bits = VT.getSizeInBits();
11334 int NumLanes = Bits / 128;
11335 int NumElements = VT.getVectorNumElements();
11336 int NumEltsPerLane = NumElements / NumLanes;
11337 assert(VT.getScalarSizeInBits() <= 32 &&
11338 "Exceeds 32-bit integer zero extension limit");
11339 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
11341 // Define a helper function to check a particular ext-scale and lower to it
11342 // if valid.
11343 auto Lower = [&](int Scale) -> SDValue {
11345 bool AnyExt = true;
11348 for (int i = 0; i < NumElements; ++i) {
11351 continue; // Valid anywhere but doesn't tell us anything.
11352 if (i % Scale != 0) {
11353 // Each of the extended elements need to be zeroable.
11357 // We no longer are in the anyext case.
11362 // Each of the base elements needs to be consecutive indices into the
11363 // same input vector.
11364 SDValue V = M < NumElements ? V1 : V2;
11365 M = M % NumElements;
11368 Offset = M - (i / Scale);
11369 } else if (InputV != V)
11370 return SDValue(); // Flip-flopping inputs.
11372 // Offset must start in the lowest 128-bit lane or at the start of an
11373 // upper lane.
11374 // FIXME: Is it ever worth allowing a negative base offset?
11375 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
11376 (Offset % NumEltsPerLane) == 0))
11379 // If we are offsetting, all referenced entries must come from the same
11380 // 128-bit lane.
11381 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
11384 if ((M % NumElements) != (Offset + (i / Scale)))
11385 return SDValue(); // Non-consecutive strided elements.
11389 // If we fail to find an input, we have a zero-shuffle which should always
11390 // have already been handled.
11391 // FIXME: Maybe handle this here in case during blending we end up with one?
11395 // If we are offsetting, don't extend if we only match a single input, we
11396 // can always do better by using a basic PSHUF or PUNPCK.
11397 if (Offset != 0 && Matches < 2)
11400 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
11401 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
11404 // The widest scale possible for extending is to a 64-bit integer.
11405 assert(Bits % 64 == 0 &&
11406 "The number of bits in a vector must be divisible by 64 on x86!");
11407 int NumExtElements = Bits / 64;
11409 // Each iteration, try extending the elements half as much, but into twice as
11410 // many elements.
11411 for (; NumExtElements < NumElements; NumExtElements *= 2) {
11412 assert(NumElements % NumExtElements == 0 &&
11413 "The input vector size must be divisible by the extended size.");
11414 if (SDValue V = Lower(NumElements / NumExtElements))
11418 // General extends failed, but 128-bit vectors may be able to use MOVQ.
11422 // Returns one of the source operands if the shuffle can be reduced to a
11423 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
11424 auto CanZExtLowHalf = [&]() {
11425 for (int i = NumElements / 2; i != NumElements; ++i)
11428 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
11430 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
11435 if (SDValue V = CanZExtLowHalf()) {
11436 V = DAG.getBitcast(MVT::v2i64, V);
11437 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
11438 return DAG.getBitcast(VT, V);
11441 // No viable ext lowering found.
11445 /// Try to get a scalar value for a specific element of a vector.
11447 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
11448 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
11449 SelectionDAG &DAG) {
11450 MVT VT = V.getSimpleValueType();
11451 MVT EltVT = VT.getVectorElementType();
11452 V = peekThroughBitcasts(V);
11454 // If the bitcasts shift the element size, we can't extract an equivalent
11455 // element from it.
11456 MVT NewVT = V.getSimpleValueType();
11457 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
11460 if (V.getOpcode() == ISD::BUILD_VECTOR ||
11461 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
11462 // Ensure the scalar operand is the same size as the destination.
11463 // FIXME: Add support for scalar truncation where possible.
11464 SDValue S = V.getOperand(Idx);
11465 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
11466 return DAG.getBitcast(EltVT, S);
11472 /// Helper to test for a load that can be folded with x86 shuffles.
11474 /// This is particularly important because the set of instructions varies
11475 /// significantly based on whether the operand is a load or not.
11476 static bool isShuffleFoldableLoad(SDValue V) {
11477 V = peekThroughBitcasts(V);
11478 return ISD::isNON_EXTLoad(V.getNode());
11481 /// Try to lower insertion of a single element into a zero vector.
11483 /// This is a common pattern for which we have especially efficient lowerings
11484 /// across all subtarget feature sets.
11485 static SDValue lowerVectorShuffleAsElementInsertion(
11486 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11487 const APInt &Zeroable, const X86Subtarget &Subtarget,
11488 SelectionDAG &DAG) {
11490 MVT EltVT = VT.getVectorElementType();
11493 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
11495 bool IsV1Zeroable = true;
11496 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11497 if (i != V2Index && !Zeroable[i]) {
11498 IsV1Zeroable = false;
11502 // Check for a single input from a SCALAR_TO_VECTOR node.
11503 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
11504 // all the smarts here sunk into that routine. However, the current
11505 // lowering of BUILD_VECTOR makes that nearly impossible until the old
11506 // vector shuffle lowering is dead.
11507 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
11509 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
11510 // We need to zext the scalar if it is smaller than an i32.
11511 V2S = DAG.getBitcast(EltVT, V2S);
11512 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
11513 // Using zext to expand a narrow element won't work for non-zero
11514 // elements.
11518 // Zero-extend directly to i32.
11519 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
11520 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
11522 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
11523 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
11524 EltVT == MVT::i16) {
11525 // Either not inserting from the low element of the input or the input
11526 // element size is too small to use VZEXT_MOVL to clear the high bits.
11530 if (!IsV1Zeroable) {
11531 // If V1 can't be treated as a zero vector we have fewer options to lower
11532 // this. We can't support integer vectors or non-zero targets cheaply, and
11533 // the V1 elements can't be permuted in any way.
11534 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
11535 if (!VT.isFloatingPoint() || V2Index != 0)
11537 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
11538 V1Mask[V2Index] = -1;
11539 if (!isNoopShuffleMask(V1Mask))
11541 if (!VT.is128BitVector())
11544 // Otherwise, use MOVSD or MOVSS.
11545 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
11546 "Only two types of floating point element types to handle!");
11547 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
11551 // This lowering only works for the low element with floating point vectors.
11552 if (VT.isFloatingPoint() && V2Index != 0)
11555 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
11557 V2 = DAG.getBitcast(VT, V2);
11559 if (V2Index != 0) {
11560 // If we have 4 or fewer lanes we can cheaply shuffle the element into
11561 // the desired position. Otherwise it is more efficient to do a vector
11562 // shift left. We know that we can do a vector shift left because all
11563 // the inputs are zero.
11564 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
11565 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
11566 V2Shuffle[V2Index] = 0;
11567 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
11569 V2 = DAG.getBitcast(MVT::v16i8, V2);
11571 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
11572 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
11573 V2 = DAG.getBitcast(VT, V2);
11579 /// Try to lower broadcast of a single - truncated - integer element,
11580 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
11582 /// This assumes we have AVX2.
11583 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
11584 SDValue V0, int BroadcastIdx,
11585 const X86Subtarget &Subtarget,
11586 SelectionDAG &DAG) {
11587 assert(Subtarget.hasAVX2() &&
11588 "We can only lower integer broadcasts with AVX2!");
11590 EVT EltVT = VT.getVectorElementType();
11591 EVT V0VT = V0.getValueType();
11593 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
11594 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
11596 EVT V0EltVT = V0VT.getVectorElementType();
11597 if (!V0EltVT.isInteger())
11600 const unsigned EltSize = EltVT.getSizeInBits();
11601 const unsigned V0EltSize = V0EltVT.getSizeInBits();
11603 // This is only a truncation if the original element type is larger.
11604 if (V0EltSize <= EltSize)
11607 assert(((V0EltSize % EltSize) == 0) &&
11608 "Scalar type sizes must all be powers of 2 on x86!");
11610 const unsigned V0Opc = V0.getOpcode();
11611 const unsigned Scale = V0EltSize / EltSize;
11612 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
11614 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
11615 V0Opc != ISD::BUILD_VECTOR)
11618 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
11620 // If we're extracting non-least-significant bits, shift so we can truncate.
11621 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
11622 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
11623 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
11624 if (const int OffsetIdx = BroadcastIdx % Scale)
11625 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
11626 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
11628 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
11629 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
11632 /// Try to lower broadcast of a single element.
11634 /// For convenience, this code also bundles all of the subtarget feature set
11635 /// filtering. While a little annoying to re-dispatch on type here, there isn't
11636 /// a convenient way to factor it out.
11637 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
11638 SDValue V1, SDValue V2,
11639 ArrayRef<int> Mask,
11640 const X86Subtarget &Subtarget,
11641 SelectionDAG &DAG) {
11642 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
11643 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
11644 (Subtarget.hasAVX2() && VT.isInteger())))
11647 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
11648 // we can only broadcast from a register with AVX2.
11649 unsigned NumElts = Mask.size();
11650 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
11651 ? X86ISD::MOVDDUP
11652 : X86ISD::VBROADCAST;
11653 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
11655 // Check that the mask is a broadcast.
11656 int BroadcastIdx = -1;
11657 for (int i = 0; i != (int)NumElts; ++i) {
11658 SmallVector<int, 8> BroadcastMask(NumElts, i);
11659 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
11665 if (BroadcastIdx < 0)
11667 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
11668 "a sorted mask where the broadcast "
11671 // Go up the chain of (vector) values to find a scalar load that we can
11672 // combine with the broadcast.
11675 switch (V.getOpcode()) {
11676 case ISD::BITCAST: {
11677 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
11678 SDValue VSrc = V.getOperand(0);
11679 unsigned NumEltBits = V.getScalarValueSizeInBits();
11680 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
11681 if ((NumEltBits % NumSrcBits) == 0)
11682 BroadcastIdx *= (NumEltBits / NumSrcBits);
11683 else if ((NumSrcBits % NumEltBits) == 0 &&
11684 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
11685 BroadcastIdx /= (NumSrcBits / NumEltBits);
11691 case ISD::CONCAT_VECTORS: {
11693 V.getOperand(0).getSimpleValueType().getVectorNumElements();
11694 V = V.getOperand(BroadcastIdx / OperandSize);
11695 BroadcastIdx %= OperandSize;
11698 case ISD::INSERT_SUBVECTOR: {
11699 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
11700 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
11704 int BeginIdx = (int)ConstantIdx->getZExtValue();
11706 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
11707 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
11708 BroadcastIdx -= BeginIdx;
11719 // Ensure the source vector and BroadcastIdx are for a suitable type.
11720 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
11721 unsigned NumEltBits = VT.getScalarSizeInBits();
11722 unsigned NumSrcBits = V.getScalarValueSizeInBits();
11723 if ((NumSrcBits % NumEltBits) == 0)
11724 BroadcastIdx *= (NumSrcBits / NumEltBits);
11725 else if ((NumEltBits % NumSrcBits) == 0 &&
11726 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
11727 BroadcastIdx /= (NumEltBits / NumSrcBits);
11731 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
11732 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
11733 V = DAG.getBitcast(SrcVT, V);
11736 // Check if this is a broadcast of a scalar. We special case lowering
11737 // for scalars so that we can more effectively fold with loads.
11738 // First, look through bitcast: if the original value has a larger element
11739 // type than the shuffle, the broadcast element is in essence truncated.
11740 // Make that explicit to ease folding.
11741 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
11742 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
11743 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
11744 return TruncBroadcast;
11746 MVT BroadcastVT = VT;
11748 // Peek through any bitcast (only useful for loads).
11749 SDValue BC = peekThroughBitcasts(V);
11751 // Also check the simpler case, where we can directly reuse the scalar.
11752 if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
11753 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
11754 V = V.getOperand(BroadcastIdx);
11756 // If we can't broadcast from a register, check that the input is a load.
11757 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
11759 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
11760 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
11761 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
11762 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
11763 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
11768 // If we are broadcasting a load that is only used by the shuffle
11769 // then we can reduce the vector load to the broadcasted scalar load.
11770 LoadSDNode *Ld = cast<LoadSDNode>(BC);
11771 SDValue BaseAddr = Ld->getOperand(1);
11772 EVT SVT = BroadcastVT.getScalarType();
11773 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
11774 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
11775 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
11776 DAG.getMachineFunction().getMachineMemOperand(
11777 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
11778 DAG.makeEquivalentMemoryOrdering(Ld, V);
11779 } else if (!BroadcastFromReg) {
11780 // We can't broadcast from a vector register.
11782 } else if (BroadcastIdx != 0) {
11783 // We can only broadcast from the zero-element of a vector register,
11784 // but it can be advantageous to broadcast from the zero-element of a
11785 // subvector.
11786 if (!VT.is256BitVector() && !VT.is512BitVector())
11789 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
11790 if (VT == MVT::v4f64 || VT == MVT::v4i64)
11793 // Only broadcast the zero-element of a 128-bit subvector.
11794 unsigned EltSize = VT.getScalarSizeInBits();
11795 if (((BroadcastIdx * EltSize) % 128) != 0)
11798 // The shuffle input might have been a bitcast we looked through; look at
11799 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
11800 // later bitcast it to BroadcastVT.
11801 assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11802 "Unexpected vector element size");
11803 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
11804 "Unexpected vector size");
11805 V = extract128BitVector(V, BroadcastIdx, DAG, DL);
11808 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
11809 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
11810 DAG.getBitcast(MVT::f64, V));
11812 // Bitcast back to the same scalar type as BroadcastVT.
11813 MVT SrcVT = V.getSimpleValueType();
11814 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
11815 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11816 "Unexpected vector element size");
11817 if (SrcVT.isVector()) {
11818 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11819 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
11821 SrcVT = BroadcastVT.getScalarType();
11823 V = DAG.getBitcast(SrcVT, V);
11826 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
11827 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
11828 V = DAG.getBitcast(MVT::f64, V);
11829 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
11830 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
11833 // We only support broadcasting from 128-bit vectors to minimize the
11834 // number of patterns we need to deal with in isel. So extract down to
11835 // 128-bits, removing as many bitcasts as possible.
11836 if (SrcVT.getSizeInBits() > 128) {
11837 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
11838 128 / SrcVT.getScalarSizeInBits());
11839 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
11840 V = DAG.getBitcast(ExtVT, V);
11843 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
11846 // Check whether we can use INSERTPS to perform the shuffle. We only use
11847 // INSERTPS when the V1 elements are already in the correct locations
11848 // because otherwise we can just always use two SHUFPS instructions which
11849 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
11850 // perform INSERTPS if a single V1 element is out of place and all V2
11851 // elements are zeroable.
11852 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
11853 unsigned &InsertPSMask,
11854 const APInt &Zeroable,
11855 ArrayRef<int> Mask,
11856 SelectionDAG &DAG) {
11857 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
11858 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
11859 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11861 // Attempt to match INSERTPS with one element from VA or VB being
11862 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
11863 // will be updated accordingly.
11864 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
11865 ArrayRef<int> CandidateMask) {
11866 unsigned ZMask = 0;
11867 int VADstIndex = -1;
11868 int VBDstIndex = -1;
11869 bool VAUsedInPlace = false;
11871 for (int i = 0; i < 4; ++i) {
11872 // Synthesize a zero mask from the zeroable elements (includes undefs).
11878 // Flag if we use any VA inputs in place.
11879 if (i == CandidateMask[i]) {
11880 VAUsedInPlace = true;
11884 // We can only insert a single non-zeroable element.
11885 if (VADstIndex >= 0 || VBDstIndex >= 0)
11888 if (CandidateMask[i] < 4) {
11889 // VA input out of place for insertion.
11892 // VB input for insertion.
11897 // Don't bother if we have no (non-zeroable) element for insertion.
11898 if (VADstIndex < 0 && VBDstIndex < 0)
11901 // Determine element insertion src/dst indices. The src index is from the
11902 // start of the inserted vector, not the start of the concatenated vector.
11903 unsigned VBSrcIndex = 0;
11904 if (VADstIndex >= 0) {
11905 // If we have a VA input out of place, we use VA as the V2 element
11906 // insertion and don't use the original V2 at all.
11907 VBSrcIndex = CandidateMask[VADstIndex];
11908 VBDstIndex = VADstIndex;
11911 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
11914 // If no V1 inputs are used in place, then the result is created only from
11915 // the zero mask and the V2 insertion - so remove V1 dependency.
11916 if (!VAUsedInPlace)
11917 VA = DAG.getUNDEF(MVT::v4f32);
11919 // Update V1, V2 and InsertPSMask accordingly.
11923 // Insert the V2 element into the desired position.
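// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0].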
11924 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
11925 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
11929 if (matchAsInsertPS(V1, V2, Mask))
11932 // Commute and try again.
11933 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11934 ShuffleVectorSDNode::commuteMask(CommutedMask);
11935 if (matchAsInsertPS(V2, V1, CommutedMask))
11941 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
11942 SDValue V2, ArrayRef<int> Mask,
11943 const APInt &Zeroable,
11944 SelectionDAG &DAG) {
11945 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11946 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11948 // Attempt to match the insertps pattern.
11949 unsigned InsertPSMask;
11950 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
11953 // Insert the V2 element into the desired position.
11954 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
11955 DAG.getConstant(InsertPSMask, DL, MVT::i8));
11958 /// Try to lower a shuffle as a permute of the inputs followed by an
11959 /// UNPCK instruction.
11961 /// This specifically targets cases where we end up with alternating between
11962 /// the two inputs, and so can permute them into something that feeds a single
11963 /// UNPCK instruction. Note that this routine only targets integer vectors
11964 /// because for floating point vectors we have a generalized SHUFPS lowering
11965 /// strategy that handles everything that doesn't *exactly* match an unpack,
11966 /// making this clever lowering unnecessary.
11967 static SDValue lowerVectorShuffleAsPermuteAndUnpack(
11968 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11969 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11970 assert(!VT.isFloatingPoint() &&
11971 "This routine only supports integer vectors.");
11972 assert(VT.is128BitVector() &&
11973 "This routine only works on 128-bit vectors.");
11974 assert(!V2.isUndef() &&
11975 "This routine should only be used when blending two inputs.");
11976 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11978 int Size = Mask.size();
11980 int NumLoInputs =
11981 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11982 int NumHiInputs =
11983 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11985 bool UnpackLo = NumLoInputs >= NumHiInputs;
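// Prefer whichever unpack (low or high) reads from the half of the inputs that
// the mask references most, so the feeding permutes stay as simple as possible.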
11987 auto TryUnpack = [&](int ScalarSize, int Scale) {
11988 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11989 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11991 for (int i = 0; i < Size; ++i) {
11995 // Each element of the unpack contains Scale elements from this mask.
11996 int UnpackIdx = i / Scale;
11998 // We only handle the case where V1 feeds the first slots of the unpack.
11999 // We rely on canonicalization to ensure this is the case.
12000 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
12003 // Setup the mask for this input. The indexing is tricky as we have to
12004 // handle the unpack stride.
12005 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
12006 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
12010 // If we will have to shuffle both inputs to use the unpack, check whether
12011 // we can just unpack first and shuffle the result. If so, skip this unpack.
12012 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
12013 !isNoopShuffleMask(V2Mask))
12016 // Shuffle the inputs into place.
12017 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12018 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12020 // Cast the inputs to the type we will use to unpack them.
12021 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
12022 V1 = DAG.getBitcast(UnpackVT, V1);
12023 V2 = DAG.getBitcast(UnpackVT, V2);
12025 // Unpack the inputs and cast the result back to the desired type.
12026 return DAG.getBitcast(
12027 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12028 UnpackVT, V1, V2));
12031 // We try each unpack from the largest to the smallest to try and find one
12032 // that fits this mask.
12033 int OrigScalarSize = VT.getScalarSizeInBits();
12034 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
12035 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
12038 // If we're shuffling with a zero vector then we're better off not doing
12039 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
12040 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
12041 ISD::isBuildVectorAllZeros(V2.getNode()))
12044 // If none of the unpack-rooted lowerings worked (or were profitable) try an
12045 // initial unpack.
12046 if (NumLoInputs == 0 || NumHiInputs == 0) {
12047 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
12048 "We have to have *some* inputs!");
12049 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
12051 // FIXME: We could consider the total complexity of the permute of each
12052 // possible unpacking. Or at the least we should consider how many
12053 // half-crossings are created.
12054 // FIXME: We could consider commuting the unpacks.
12056 SmallVector<int, 32> PermMask((unsigned)Size, -1);
12057 for (int i = 0; i < Size; ++i) {
12061 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
12063 PermMask[i] =
12064 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
12066 return DAG.getVectorShuffle(
12067 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
12069 DAG.getUNDEF(VT), PermMask);
12075 /// Handle lowering of 2-lane 64-bit floating point shuffles.
12077 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
12078 /// support for floating point shuffles but not integer shuffles. These
12079 /// instructions will incur a domain crossing penalty on some chips though so
12080 /// it is better to avoid lowering through this for integer vectors where
12081 /// possible.
12082 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12083 const APInt &Zeroable,
12084 SDValue V1, SDValue V2,
12085 const X86Subtarget &Subtarget,
12086 SelectionDAG &DAG) {
12087 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12088 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12089 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12091 if (V2.isUndef()) {
12092 // Check for being able to broadcast a single element.
12093 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12094 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
12097 // Straight shuffle of a single input vector. Simulate this by using the
12098 // single input as both of the "inputs" to this instruction.
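// Bit 0 of the SHUFPD immediate selects the element of the first operand that
// goes into result lane 0; bit 1 selects the element of the second operand
// that goes into result lane 1.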
12099 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12101 if (Subtarget.hasAVX()) {
12102 // If we have AVX, we can use VPERMILPS which will allow folding a load
12103 // into the shuffle.
12104 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12105 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
12108 return DAG.getNode(
12109 X86ISD::SHUFP, DL, MVT::v2f64,
12110 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12111 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12112 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
12114 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12115 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12116 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12117 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12119 // When loading a scalar and then shuffling it into a vector we can often do
12120 // the insertion cheaply.
12121 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12122 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12124 // Try inverting the insertion since for v2 masks it is easy to do and we
12125 // can't reliably sort the mask one way or the other.
12126 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12127 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12128 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12129 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12132 // Try to use one of the special instruction patterns to handle two common
12133 // blend patterns if a zero-blend above didn't work.
12134 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
12135 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
12136 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12137 // We can either use a special instruction to load over the low double or
12138 // to move just the low double.
12139 return DAG.getNode(
12140 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12141 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12143 if (Subtarget.hasSSE41())
12144 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12145 Zeroable, Subtarget, DAG))
12148 // Use dedicated unpack instructions for masks that match their pattern.
12150 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12153 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12154 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12155 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
12158 /// Handle lowering of 2-lane 64-bit integer shuffles.
12160 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12161 /// the integer unit to minimize domain crossing penalties. However, for blends
12162 /// it falls back to the floating point shuffle operation with appropriate bit
12163 /// masking.
12164 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12165 const APInt &Zeroable,
12166 SDValue V1, SDValue V2,
12167 const X86Subtarget &Subtarget,
12168 SelectionDAG &DAG) {
12169 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12170 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12171 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12173 if (V2.isUndef()) {
12174 // Check for being able to broadcast a single element.
12175 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12176 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
12179 // Straight shuffle of a single input vector. For everything from SSE2
12180 // onward this has a single fast instruction with no scary immediates.
12181 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12182 V1 = DAG.getBitcast(MVT::v4i32, V1);
12183 int WidenedMask[4] = {
12184 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
12185 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
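// e.g. the v2i64 mask <1,0> becomes the v4i32 PSHUFD mask <2,3,0,1>.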
12186 return DAG.getBitcast(
12188 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12189 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12191 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12192 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12193 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12194 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12196 // Try to use shift instructions.
12197 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
12198 Zeroable, Subtarget, DAG))
12201 // When loading a scalar and then shuffling it into a vector we can often do
12202 // the insertion cheaply.
12203 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12204 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12206 // Try inverting the insertion since for v2 masks it is easy to do and we
12207 // can't reliably sort the mask one way or the other.
12208 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12209 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12210 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12213 // We have different paths for blend lowering, but they all must use the
12214 // *exact* same predicate.
12215 bool IsBlendSupported = Subtarget.hasSSE41();
12216 if (IsBlendSupported)
12217 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12218 Zeroable, Subtarget, DAG))
12221 // Use dedicated unpack instructions for masks that match their pattern.
12223 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12226 // Try to use byte rotation instructions.
12227 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12228 if (Subtarget.hasSSSE3()) {
12229 if (Subtarget.hasVLX())
12230 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
12231 Mask, Subtarget, DAG))
12234 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12235 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
12239 // If we have direct support for blends, we should lower by decomposing into
12240 // a permute. That will be faster than the domain cross.
12241 if (IsBlendSupported)
12242 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
12243 Mask, Subtarget, DAG);
12245 // We implement this with SHUFPD which is pretty lame because it will likely
12246 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12247 // However, all the alternatives are still more cycles and newer chips don't
12248 // have this problem. It would be really nice if x86 had better shuffles here.
12249 V1 = DAG.getBitcast(MVT::v2f64, V1);
12250 V2 = DAG.getBitcast(MVT::v2f64, V2);
12251 return DAG.getBitcast(MVT::v2i64,
12252 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
12255 /// Test whether this can be lowered with a single SHUFPS instruction.
12257 /// This is used to disable more specialized lowerings when the shufps lowering
12258 /// will happen to be efficient.
12259 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12260 // This routine only handles 128-bit shufps.
12261 assert(Mask.size() == 4 && "Unsupported mask size!");
12262 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12263 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12264 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12265 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12267 // To lower with a single SHUFPS we need to have the low half and high half
12268 // each requiring a single input.
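// For example, <0,1,4,5> uses a single input per half and can be one SHUFPS,
// while <0,4,1,5> mixes both inputs in its low half and cannot.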
12269 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12271 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12277 /// Lower a vector shuffle using the SHUFPS instruction.
12279 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12280 /// It makes no assumptions about whether this is the *best* lowering; it
12281 /// simply uses it.
12282 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12283 ArrayRef<int> Mask, SDValue V1,
12284 SDValue V2, SelectionDAG &DAG) {
12285 SDValue LowV = V1, HighV = V2;
12286 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
12288 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12290 if (NumV2Elements == 1) {
12291 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12293 // Compute the index adjacent to V2Index and in the same half by toggling
12294 // the low bit.
12295 int V2AdjIndex = V2Index ^ 1;
12297 if (Mask[V2AdjIndex] < 0) {
12298 // Handles all the cases where we have a single V2 element and an undef.
12299 // This will only ever happen in the high lanes because we commute the
12300 // vector otherwise.
12302 std::swap(LowV, HighV);
12303 NewMask[V2Index] -= 4;
12305 // Handle the case where the V2 element ends up adjacent to a V1 element.
12306 // To make this work, blend them together as the first step.
12307 int V1Index = V2AdjIndex;
12308 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
12309 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
12310 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12312 // Now proceed to reconstruct the final blend as we have the necessary
12313 // high or low half formed.
12320 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
12321 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
12323 } else if (NumV2Elements == 2) {
12324 if (Mask[0] < 4 && Mask[1] < 4) {
12325 // Handle the easy case where we have V1 in the low lanes and V2 in the
12329 } else if (Mask[2] < 4 && Mask[3] < 4) {
12330 // We also handle the reversed case because this utility may get called
12331 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
12332 // arrange things in the right direction.
12338 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
12339 // trying to place elements directly, just blend them and set up the final
12340 // shuffle to place them.
12342 // The first two blend mask elements are for V1, the second two are for
12343 // V2.
12344 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
12345 Mask[2] < 4 ? Mask[2] : Mask[3],
12346 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
12347 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
12348 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12349 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12351 // Now we do a normal shuffle of V1 by giving V1 as both operands to
12352 // the SHUFP.
12354 NewMask[0] = Mask[0] < 4 ? 0 : 2;
12355 NewMask[1] = Mask[0] < 4 ? 2 : 0;
12356 NewMask[2] = Mask[2] < 4 ? 1 : 3;
12357 NewMask[3] = Mask[2] < 4 ? 3 : 1;
12360 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
12361 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
12364 /// Lower 4-lane 32-bit floating point shuffles.
12366 /// Uses instructions exclusively from the floating point unit to minimize
12367 /// domain crossing penalties, as these are sufficient to implement all v4f32
12368 /// shuffles.
12369 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12370 const APInt &Zeroable,
12371 SDValue V1, SDValue V2,
12372 const X86Subtarget &Subtarget,
12373 SelectionDAG &DAG) {
12374 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12375 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12376 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12378 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12380 if (NumV2Elements == 0) {
12381 // Check for being able to broadcast a single element.
12382 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12383 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
12386 // Use even/odd duplicate instructions for masks that match their pattern.
12387 if (Subtarget.hasSSE3()) {
12388 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12389 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
12390 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
12391 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
12394 if (Subtarget.hasAVX()) {
12395 // If we have AVX, we can use VPERMILPS which will allow folding a load
12396 // into the shuffle.
12397 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
12398 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12401 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
12402 // in SSE1 because otherwise they are widened to v2f64 and never get here.
12403 if (!Subtarget.hasSSE2()) {
12404 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
12405 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
12406 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
12407 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
12410 // Otherwise, use a straight shuffle of a single input vector. We pass the
12411 // input vector to both operands to simulate this with a SHUFPS.
12412 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
12413 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12416 // There are special ways we can lower some single-element blends. However, we
12417 // have custom ways we can lower more complex single-element blends below that
12418 // we defer to if both this and BLENDPS fail to match, so restrict this to
12419 // when the V2 input is targeting element 0 of the mask -- that is the fast
12420 // case here.
12421 if (NumV2Elements == 1 && Mask[0] >= 4)
12422 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12423 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12426 if (Subtarget.hasSSE41()) {
12427 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
12428 Zeroable, Subtarget, DAG))
12431 // Use INSERTPS if we can complete the shuffle efficiently.
12433 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
12436 if (!isSingleSHUFPSMask(Mask))
12437 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
12438 DL, MVT::v4f32, V1, V2, Mask, DAG))
12442 // Use low/high mov instructions. These are only valid in SSE1 because
12443 // otherwise they are widened to v2f64 and never get here.
12444 if (!Subtarget.hasSSE2()) {
12445 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
12446 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
12447 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
12448 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
12451 // Use dedicated unpack instructions for masks that match their pattern.
12453 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
12456 // Otherwise fall back to a SHUFPS lowering strategy.
12457 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
12460 /// Lower 4-lane i32 vector shuffles.
12462 /// We try to handle these with integer-domain shuffles where we can, but for
12463 /// blends we use the floating point domain blend instructions.
12464 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12465 const APInt &Zeroable,
12466 SDValue V1, SDValue V2,
12467 const X86Subtarget &Subtarget,
12468 SelectionDAG &DAG) {
12469 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
12470 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
12471 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12473 // Whenever we can lower this as a zext, that instruction is strictly faster
12474 // than any alternative. It also allows us to fold memory operands into the
12475 // shuffle in many cases.
12476 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12477 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12478 return ZExt;
12480 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12482 if (NumV2Elements == 0) {
12483 // Check for being able to broadcast a single element.
12484 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12485 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
12486 return Broadcast;
12488 // Straight shuffle of a single input vector. For everything from SSE2
12489 // onward this has a single fast instruction with no scary immediates.
12490 // We coerce the shuffle pattern to be compatible with UNPCK instructions
12491 // but we aren't actually going to use the UNPCK instruction because doing
12492 // so prevents folding a load into this instruction or making a copy.
12493 const int UnpackLoMask[] = {0, 0, 1, 1};
12494 const int UnpackHiMask[] = {2, 2, 3, 3};
12495 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
12496 Mask = UnpackLoMask;
12497 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
12498 Mask = UnpackHiMask;
12500 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12501 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12502 }
12504 // Try to use shift instructions.
12505 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
12506 Zeroable, Subtarget, DAG))
12507 return Shift;
12509 // There are special ways we can lower some single-element blends.
12510 if (NumV2Elements == 1)
12511 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12512 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12513 return V;
12515 // We have different paths for blend lowering, but they all must use the
12516 // *exact* same predicate.
12517 bool IsBlendSupported = Subtarget.hasSSE41();
12518 if (IsBlendSupported)
12519 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
12520 Zeroable, Subtarget, DAG))
12521 return Blend;
12523 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
12524 Zeroable, DAG))
12525 return Masked;
12527 // Use dedicated unpack instructions for masks that match their pattern.
12528 if (SDValue V =
12529 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
12530 return V;
12532 // Try to use byte rotation instructions.
12533 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12534 if (Subtarget.hasSSSE3()) {
12535 if (Subtarget.hasVLX())
12536 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
12537 Mask, Subtarget, DAG))
12538 return Rotate;
12540 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12541 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
12542 return Rotate;
12543 }
12545 // Assume that a single SHUFPS is faster than an alternative sequence of
12546 // multiple instructions (even if the CPU has a domain penalty).
12547 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12548 if (!isSingleSHUFPSMask(Mask)) {
12549 // If we have direct support for blends, we should lower by decomposing into
12550 // a permute. That will be faster than the domain cross.
12551 if (IsBlendSupported)
12552 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
12553 Mask, Subtarget, DAG);
12555 // Try to lower by permuting the inputs into an unpack instruction.
12556 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12557 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
12558 return Unpack;
12559 }
12561 // We implement this with SHUFPS because it can blend from two vectors.
12562 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
12563 // up the inputs, bypassing domain shift penalties that we would incur if we
12564 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
12565 // relevant.
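// As a concrete illustration: a two-input integer mask such as {0, 7, 2, 5}
// is rebuilt below as a v4f32 shuffle, so the recursive lowering stays in the
// floating-point domain (SHUFPS and friends) instead of producing a PSHUFD
// plus integer blend that would cross domains on those older chips.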
12566 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
12567 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
12568 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
12569 return DAG.getBitcast(MVT::v4i32, ShufPS);
12570 }
12572 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
12573 /// shuffle lowering, and the most complex part.
12575 /// The lowering strategy is to try to form pairs of input lanes which are
12576 /// targeted at the same half of the final vector, and then use a dword shuffle
12577 /// to place them onto the right half, and finally unpack the paired lanes into
12578 /// their final position.
12580 /// The exact breakdown of how to form these dword pairs and align them on the
12581 /// correct sides is really tricky. See the comments within the function for
12582 /// more of the details.
12584 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
12585 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
12586 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
12587 /// vector, form the analogous 128-bit 8-element Mask.
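/// Worked example: for the single-input mask <2, 3, 0, 1, 7, 6, 5, 4> every
/// low-half output already reads low-half inputs and every high-half output
/// reads high-half inputs, so nothing crosses the half boundary. The lowering
/// below then reduces to PSHUFLW with {2,3,0,1} plus PSHUFHW with {3,2,1,0},
/// and no dword (PSHUFD) shuffle is needed.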
12588 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
12589 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
12590 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12591 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
12592 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
12594 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
12595 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
12596 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
12598 // Attempt to directly match PSHUFLW or PSHUFHW.
12599 if (isUndefOrInRange(LoMask, 0, 4) &&
12600 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
12601 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12602 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
12604 if (isUndefOrInRange(HiMask, 4, 8) &&
12605 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
12606 for (int i = 0; i != 4; ++i)
12607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
12608 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12609 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
12612 SmallVector<int, 4> LoInputs;
12613 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
12614 array_pod_sort(LoInputs.begin(), LoInputs.end());
12615 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
12616 SmallVector<int, 4> HiInputs;
12617 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
12618 array_pod_sort(HiInputs.begin(), HiInputs.end());
12619 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
12620 int NumLToL =
12621 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
12622 int NumHToL = LoInputs.size() - NumLToL;
12623 int NumLToH =
12624 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
12625 int NumHToH = HiInputs.size() - NumLToH;
12626 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
12627 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
12628 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
12629 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
12631 // If we are shuffling values from one half - check how many different DWORD
12632 // pairs we need to create. If only 1 or 2 then we can perform this as a
12633 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
12634 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
12635 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
12636 V = DAG.getNode(ShufWOp, DL, VT, V,
12637 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
12638 V = DAG.getBitcast(PSHUFDVT, V);
12639 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
12640 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12641 return DAG.getBitcast(VT, V);
12642 };
12644 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
12645 int PSHUFDMask[4] = { -1, -1, -1, -1 };
12646 SmallVector<std::pair<int, int>, 4> DWordPairs;
12647 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
12649 // Collect the different DWORD pairs.
12650 for (int DWord = 0; DWord != 4; ++DWord) {
12651 int M0 = Mask[2 * DWord + 0];
12652 int M1 = Mask[2 * DWord + 1];
12653 M0 = (M0 >= 0 ? M0 % 4 : M0);
12654 M1 = (M1 >= 0 ? M1 % 4 : M1);
12655 if (M0 < 0 && M1 < 0)
12656 continue;
12658 bool Match = false;
12659 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
12660 auto &DWordPair = DWordPairs[j];
12661 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
12662 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
12663 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
12664 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
12665 PSHUFDMask[DWord] = DOffset + j;
12666 Match = true;
12667 break;
12668 }
12669 }
12670 if (!Match) {
12671 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
12672 DWordPairs.push_back(std::make_pair(M0, M1));
12673 }
12674 }
12676 if (DWordPairs.size() <= 2) {
12677 DWordPairs.resize(2, std::make_pair(-1, -1));
12678 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
12679 DWordPairs[1].first, DWordPairs[1].second};
12680 if ((NumHToL + NumHToH) == 0)
12681 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
12682 if ((NumLToL + NumLToH) == 0)
12683 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
12687 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
12688 // such inputs we can swap two of the dwords across the half mark and end up
12689 // with <=2 inputs to each half in each half. Once there, we can fall through
12690 // to the generic code below. For example:
12692 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
12693 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
12695 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
12696 // and an existing 2-into-2 on the other half. In this case we may have to
12697 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
12698 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
12699 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
12700 // because any other situation (including a 3-into-1 or 1-into-3 in the other
12701 // half than the one we target for fixing) will be fixed when we re-enter this
12702 // path. We will also combine away any sequence of PSHUFD instructions that
12703 // result into a single instruction. Here is an example of the tricky case:
12705 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
12706 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
12708 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
12710 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
12711 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
12713 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
12714 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
12716 // The result is fine to be handled by the generic logic.
12717 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
12718 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
12719 int AOffset, int BOffset) {
12720 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
12721 "Must call this with A having 3 or 1 inputs from the A half.");
12722 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
12723 "Must call this with B having 1 or 3 inputs from the B half.");
12724 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
12725 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
12727 bool ThreeAInputs = AToAInputs.size() == 3;
12729 // Compute the index of dword with only one word among the three inputs in
12730 // a half by taking the sum of the half with three inputs and subtracting
12731 // the sum of the actual three inputs. The difference is the remaining
12733 int ADWord, BDWord;
12734 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
12735 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
12736 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
12737 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
12738 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
12739 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
12740 int TripleNonInputIdx =
12741 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
12742 TripleDWord = TripleNonInputIdx / 2;
12744 // We use xor with one to compute the adjacent DWord to whichever one the
12746 OneInputDWord = (OneInput / 2) ^ 1;
12748 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
12749 // and BToA inputs. If there is also such a problem with the BToB and AToB
12750 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
12751 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
12752 // is essential that we don't *create* a 3<-1 as then we might oscillate.
12753 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
12754 // Compute how many inputs will be flipped by swapping these DWords. We
12756 // to balance this to ensure we don't form a 3-1 shuffle in the other
12758 int NumFlippedAToBInputs =
12759 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
12760 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
12761 int NumFlippedBToBInputs =
12762 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
12763 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
12764 if ((NumFlippedAToBInputs == 1 &&
12765 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
12766 (NumFlippedBToBInputs == 1 &&
12767 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
12768 // We choose whether to fix the A half or B half based on whether that
12769 // half has zero flipped inputs. At zero, we may not be able to fix it
12770 // with that half. We also bias towards fixing the B half because that
12771 // will more commonly be the high half, and we have to bias one way.
12772 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
12773 ArrayRef<int> Inputs) {
12774 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
12775 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
12776 // Determine whether the free index is in the flipped dword or the
12777 // unflipped dword based on where the pinned index is. We use this bit
12778 // in an xor to conditionally select the adjacent dword.
12779 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
12780 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
12781 if (IsFixIdxInput == IsFixFreeIdxInput)
12782 FixFreeIdx += 1;
12783 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
12784 assert(IsFixIdxInput != IsFixFreeIdxInput &&
12785 "We need to be changing the number of flipped inputs!");
12786 int PSHUFHalfMask[] = {0, 1, 2, 3};
12787 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
12788 V = DAG.getNode(
12789 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
12790 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
12791 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
12793 for (int &M : Mask)
12794 if (M >= 0 && M == FixIdx)
12795 M = FixFreeIdx;
12796 else if (M >= 0 && M == FixFreeIdx)
12797 M = FixIdx;
12798 };
12799 if (NumFlippedBToBInputs != 0) {
12800 int BPinnedIdx =
12801 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
12802 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
12803 } else {
12804 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
12805 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
12806 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
12811 int PSHUFDMask[] = {0, 1, 2, 3};
12812 PSHUFDMask[ADWord] = BDWord;
12813 PSHUFDMask[BDWord] = ADWord;
12814 V = DAG.getBitcast(
12816 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12817 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12819 // Adjust the mask to match the new locations of A and B.
12820 for (int &M : Mask)
12821 if (M >= 0 && M/2 == ADWord)
12822 M = 2 * BDWord + M % 2;
12823 else if (M >= 0 && M/2 == BDWord)
12824 M = 2 * ADWord + M % 2;
12826 // Recurse back into this routine to re-compute state now that this isn't
12827 // a 3 and 1 problem.
12828 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
12829 DAG);
12830 };
12831 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
12832 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
12833 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
12834 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
12836 // At this point there are at most two inputs to the low and high halves from
12837 // each half. That means the inputs can always be grouped into dwords and
12838 // those dwords can then be moved to the correct half with a dword shuffle.
12839 // We use at most one low and one high word shuffle to collect these paired
12840 // inputs into dwords, and finally a dword shuffle to place them.
12841 int PSHUFLMask[4] = {-1, -1, -1, -1};
12842 int PSHUFHMask[4] = {-1, -1, -1, -1};
12843 int PSHUFDMask[4] = {-1, -1, -1, -1};
12845 // First fix the masks for all the inputs that are staying in their
12846 // original halves. This will then dictate the targets of the cross-half
12847 // shuffles.
12848 auto fixInPlaceInputs =
12849 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
12850 MutableArrayRef<int> SourceHalfMask,
12851 MutableArrayRef<int> HalfMask, int HalfOffset) {
12852 if (InPlaceInputs.empty())
12853 return;
12854 if (InPlaceInputs.size() == 1) {
12855 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12856 InPlaceInputs[0] - HalfOffset;
12857 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
12858 return;
12859 }
12860 if (IncomingInputs.empty()) {
12861 // Just fix all of the in place inputs.
12862 for (int Input : InPlaceInputs) {
12863 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
12864 PSHUFDMask[Input / 2] = Input / 2;
12865 }
12866 return;
12867 }
12869 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
12870 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12871 InPlaceInputs[0] - HalfOffset;
12872 // Put the second input next to the first so that they are packed into
12873 // a dword. We find the adjacent index by toggling the low bit.
12874 int AdjIndex = InPlaceInputs[0] ^ 1;
12875 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
12876 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
12877 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
12878 };
12879 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
12880 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
12882 // Now gather the cross-half inputs and place them into a free dword of
12883 // their target half.
12884 // FIXME: This operation could almost certainly be simplified dramatically to
12885 // look more like the 3-1 fixing operation.
12886 auto moveInputsToRightHalf = [&PSHUFDMask](
12887 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
12888 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
12889 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
12890 int DestOffset) {
12891 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
12892 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
12893 };
12894 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
12895 int Word) {
12896 int LowWord = Word & ~1;
12897 int HighWord = Word | 1;
12898 return isWordClobbered(SourceHalfMask, LowWord) ||
12899 isWordClobbered(SourceHalfMask, HighWord);
12900 };
12902 if (IncomingInputs.empty())
12903 return;
12905 if (ExistingInputs.empty()) {
12906 // Map any dwords with inputs from them into the right half.
12907 for (int Input : IncomingInputs) {
12908 // If the source half mask maps over the inputs, turn those into
12909 // swaps and use the swapped lane.
12910 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
12911 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
12912 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
12913 Input - SourceOffset;
12914 // We have to swap the uses in our half mask in one sweep.
12915 for (int &M : HalfMask)
12916 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
12917 M = Input;
12918 else if (M == Input)
12919 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12920 } else {
12921 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
12922 Input - SourceOffset &&
12923 "Previous placement doesn't match!");
12925 // Note that this correctly re-maps both when we do a swap and when
12926 // we observe the other side of the swap above. We rely on that to
12927 // avoid swapping the members of the input list directly.
12928 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12931 // Map the input's dword into the correct half.
12932 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
12933 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
12934 else
12935 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
12936 Input / 2 &&
12937 "Previous placement doesn't match!");
12938 }
12940 // And just directly shift any other-half mask elements to be same-half
12941 // as we will have mirrored the dword containing the element into the
12942 // same position within that half.
12943 for (int &M : HalfMask)
12944 if (M >= SourceOffset && M < SourceOffset + 4) {
12945 M = M - SourceOffset + DestOffset;
12946 assert(M >= 0 && "This should never wrap below zero!");
12947 }
12949 return;
12950 }
12951 // Ensure we have the input in a viable dword of its current half. This
12952 // is particularly tricky because the original position may be clobbered
12953 // by inputs being moved and *staying* in that half.
12954 if (IncomingInputs.size() == 1) {
12955 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12956 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
12957 SourceOffset;
12958 SourceHalfMask[InputFixed - SourceOffset] =
12959 IncomingInputs[0] - SourceOffset;
12960 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
12961 InputFixed);
12962 IncomingInputs[0] = InputFixed;
12963 }
12964 } else if (IncomingInputs.size() == 2) {
12965 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
12966 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12967 // We have two non-adjacent or clobbered inputs we need to extract from
12968 // the source half. To do this, we need to map them into some adjacent
12969 // dword slot in the source mask.
12970 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
12971 IncomingInputs[1] - SourceOffset};
12973 // If there is a free slot in the source half mask adjacent to one of
12974 // the inputs, place the other input in it. We use (Index XOR 1) to
12975 // compute an adjacent index.
12976 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
12977 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
12978 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
12979 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12980 InputsFixed[1] = InputsFixed[0] ^ 1;
12981 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
12982 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
12983 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
12984 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
12985 InputsFixed[0] = InputsFixed[1] ^ 1;
12986 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
12987 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
12988 // The two inputs are in the same DWord but it is clobbered and the
12989 // adjacent DWord isn't used at all. Move both inputs to the free
12991 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
12992 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
12993 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
12994 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
12995 } else {
12996 // The only way we hit this point is if there is no clobbering
12997 // (because there are no off-half inputs to this half) and there is no
12998 // free slot adjacent to one of the inputs. In this case, we have to
12999 // swap an input with a non-input.
13000 for (int i = 0; i < 4; ++i)
13001 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13002 "We can't handle any clobbers here!");
13003 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13004 "Cannot have adjacent inputs here!");
13006 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13007 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13009 // We also have to update the final source mask in this case because
13010 // it may need to undo the above swap.
13011 for (int &M : FinalSourceHalfMask)
13012 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13013 M = InputsFixed[1] + SourceOffset;
13014 else if (M == InputsFixed[1] + SourceOffset)
13015 M = (InputsFixed[0] ^ 1) + SourceOffset;
13017 InputsFixed[1] = InputsFixed[0] ^ 1;
13020 // Point everything at the fixed inputs.
13021 for (int &M : HalfMask)
13022 if (M == IncomingInputs[0])
13023 M = InputsFixed[0] + SourceOffset;
13024 else if (M == IncomingInputs[1])
13025 M = InputsFixed[1] + SourceOffset;
13027 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13028 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13029 }
13030 } else {
13031 llvm_unreachable("Unhandled input size!");
13032 }
13034 // Now hoist the DWord down to the right half.
13035 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13036 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13037 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13038 for (int &M : HalfMask)
13039 for (int Input : IncomingInputs)
13040 if (M == Input)
13041 M = FreeDWord * 2 + Input % 2;
13042 };
13043 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13044 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13045 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13046 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13048 // Now enact all the shuffles we've computed to move the inputs into their
13049 // target half.
13050 if (!isNoopShuffleMask(PSHUFLMask))
13051 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13052 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13053 if (!isNoopShuffleMask(PSHUFHMask))
13054 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13055 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13056 if (!isNoopShuffleMask(PSHUFDMask))
13057 V = DAG.getBitcast(
13059 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13060 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13062 // At this point, each half should contain all its inputs, and we can then
13063 // just shuffle them into their final position.
13064 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13065 "Failed to lift all the high half inputs to the low mask!");
13066 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13067 "Failed to lift all the low half inputs to the high mask!");
13069 // Do a half shuffle for the low mask.
13070 if (!isNoopShuffleMask(LoMask))
13071 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13072 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13074 // Do a half shuffle with the high mask after shifting its values down.
13075 for (int &M : HiMask)
13076 if (M >= 0)
13077 M -= 4;
13078 if (!isNoopShuffleMask(HiMask))
13079 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13080 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13082 return V;
13083 }
13085 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13086 /// blend if only one input is used.
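/// As a reminder of the underlying instruction: PSHUFB selects each result
/// byte via the matching control byte, where an index of 0-15 picks that
/// source byte and a control byte with its high bit set (0x80 below) produces
/// a zero. Two PSHUFBs that zero complementary lanes can therefore be merged
/// with a plain OR.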
13087 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
13088 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13089 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13090 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
13091 "Lane crossing shuffle masks not supported");
13093 int NumBytes = VT.getSizeInBits() / 8;
13094 int Size = Mask.size();
13095 int Scale = NumBytes / Size;
13097 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13098 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13100 V1InUse = false;
13101 V2InUse = false;
13102 for (int i = 0; i < NumBytes; ++i) {
13103 int M = Mask[i / Scale];
13104 if (M < 0)
13105 continue;
13107 const int ZeroMask = 0x80;
13108 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13109 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13110 if (Zeroable[i / Scale])
13111 V1Idx = V2Idx = ZeroMask;
13113 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13114 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13115 V1InUse |= (ZeroMask != V1Idx);
13116 V2InUse |= (ZeroMask != V2Idx);
13117 }
13119 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13120 if (V1InUse)
13121 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13122 DAG.getBuildVector(ShufVT, DL, V1Mask));
13123 if (V2InUse)
13124 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13125 DAG.getBuildVector(ShufVT, DL, V2Mask));
13127 // If we need shuffled inputs from both, blend the two.
13128 SDValue V;
13129 if (V1InUse && V2InUse)
13130 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13131 else
13132 V = V1InUse ? V1 : V2;
13134 // Cast the result back to the correct type.
13135 return DAG.getBitcast(VT, V);
13136 }
13138 /// Generic lowering of 8-lane i16 shuffles.
13140 /// This handles both single-input shuffles and combined shuffle/blends with
13141 /// two inputs. The single input shuffles are immediately delegated to
13142 /// a dedicated lowering routine.
13144 /// The blends are lowered in one of three fundamental ways. If there are few
13145 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13146 /// of the input is significantly cheaper when lowered as an interleaving of
13147 /// the two inputs, try to interleave them. Otherwise, blend the low and high
13148 /// halves of the inputs separately (making them have relatively few inputs)
13149 /// and then concatenate them.
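/// For example, the two-input mask <0, 8, 1, 9, 2, 10, 3, 11> is exactly an
/// interleaving of the low halves of the two inputs, so it is caught directly
/// by the dedicated UNPCKL pattern below rather than being decomposed into
/// blends.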
13150 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13151 const APInt &Zeroable,
13152 SDValue V1, SDValue V2,
13153 const X86Subtarget &Subtarget,
13154 SelectionDAG &DAG) {
13155 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13156 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13157 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13159 // Whenever we can lower this as a zext, that instruction is strictly faster
13160 // than any alternative.
13161 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13162 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13163 return ZExt;
13165 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13167 if (NumV2Inputs == 0) {
13168 // Check for being able to broadcast a single element.
13169 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13170 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
13171 return Broadcast;
13173 // Try to use shift instructions.
13174 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
13175 Zeroable, Subtarget, DAG))
13176 return Shift;
13178 // Use dedicated unpack instructions for masks that match their pattern.
13179 if (SDValue V =
13180 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13181 return V;
13183 // Use dedicated pack instructions for masks that match their pattern.
13184 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
13185 DAG, Subtarget))
13186 return V;
13188 // Try to use byte rotation instructions.
13189 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
13190 Mask, Subtarget, DAG))
13191 return Rotate;
13193 // Make a copy of the mask so it can be modified.
13194 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
13195 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
13196 MutableMask, Subtarget,
13197 DAG);
13198 }
13200 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13201 "All single-input shuffles should be canonicalized to be V1-input "
13204 // Try to use shift instructions.
13205 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
13206 Zeroable, Subtarget, DAG))
13207 return Shift;
13209 // See if we can use SSE4A Extraction / Insertion.
13210 if (Subtarget.hasSSE4A())
13211 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13212 Zeroable, DAG))
13213 return V;
13215 // There are special ways we can lower some single-element blends.
13216 if (NumV2Inputs == 1)
13217 if (SDValue V = lowerVectorShuffleAsElementInsertion(
13218 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13219 return V;
13221 // We have different paths for blend lowering, but they all must use the
13222 // *exact* same predicate.
13223 bool IsBlendSupported = Subtarget.hasSSE41();
13224 if (IsBlendSupported)
13225 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13226 Zeroable, Subtarget, DAG))
13227 return Blend;
13229 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13230 Zeroable, DAG))
13231 return Masked;
13233 // Use dedicated unpack instructions for masks that match their pattern.
13234 if (SDValue V =
13235 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13236 return V;
13238 // Use dedicated pack instructions for masks that match their pattern.
13239 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13240 Subtarget))
13241 return V;
13243 // Try to use byte rotation instructions.
13244 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13245 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
13246 return Rotate;
13248 if (SDValue BitBlend =
13249 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
13250 return BitBlend;
13252 // Try to lower by permuting the inputs into an unpack instruction.
13253 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
13254 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
13255 return Unpack;
13257 // If we can't directly blend but can use PSHUFB, that will be better as it
13258 // can both shuffle and set up the inefficient blend.
13259 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
13260 bool V1InUse, V2InUse;
13261 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
13262 Zeroable, DAG, V1InUse, V2InUse);
13265 // We can always bit-blend if we have to so the fallback strategy is to
13266 // decompose into single-input permutes and blends.
13267 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
13268 Mask, Subtarget, DAG);
13269 }
13271 /// Check whether a compaction lowering can be done by dropping even
13272 /// elements and compute how many times even elements must be dropped.
13274 /// This handles shuffles which take every Nth element where N is a power of
13275 /// two. Example shuffle masks:
13277 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
13278 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
13279 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
13280 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
13281 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
13282 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
13284 /// Any of these lanes can of course be undef.
13286 /// This routine only supports N <= 3.
13287 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
13290 /// \returns N above, or the number of times even elements must be dropped if
13291 /// there is such a number. Otherwise returns zero.
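/// For instance, the first N = 1 mask above keeps every second element; the
/// v16i8 caller below then zeroes the dropped (odd) bytes with an AND and
/// needs a single PACKUS to compact the result, while an N = 2 mask needs two
/// PACKUS steps.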
13292 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
13293 bool IsSingleInput) {
13294 // The modulus for the shuffle vector entries is based on whether this is
13295 // a single input or not.
13296 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
13297 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
13298 "We should only be called with masks with a power-of-2 size!");
13300 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
13302 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
13303 // and 2^3 simultaneously. This is because we may have ambiguity with
13304 // partially undef inputs.
13305 bool ViableForN[3] = {true, true, true};
13307 for (int i = 0, e = Mask.size(); i < e; ++i) {
13308 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
13309 // need.
13311 if (Mask[i] < 0)
13312 continue;
13313 bool IsAnyViable = false;
13314 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
13315 if (ViableForN[j]) {
13316 uint64_t N = j + 1;
13318 // The shuffle mask must be equal to (i * 2^N) % M.
13319 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
13320 IsAnyViable = true;
13321 else
13322 ViableForN[j] = false;
13323 }
13324 // Early exit if we exhaust the possible powers of two.
13325 if (!IsAnyViable)
13326 break;
13327 }
13329 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
13330 if (ViableForN[j])
13331 return j + 1;
13333 // Return 0 as there is no viable power of two.
13334 return 0;
13335 }
13337 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
13338 ArrayRef<int> Mask, SDValue V1,
13339 SDValue V2, SelectionDAG &DAG) {
13340 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
13341 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
13343 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
13344 if (V2.isUndef())
13345 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
13347 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
13348 }
13350 /// Generic lowering of v16i8 shuffles.
13352 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
13353 /// detect any complexity reducing interleaving. If that doesn't help, it uses
13354 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
13355 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
13356 /// back together.
13357 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13358 const APInt &Zeroable,
13359 SDValue V1, SDValue V2,
13360 const X86Subtarget &Subtarget,
13361 SelectionDAG &DAG) {
13362 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
13363 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
13364 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13366 // Try to use shift instructions.
13367 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
13368 Zeroable, Subtarget, DAG))
13369 return Shift;
13371 // Try to use byte rotation instructions.
13372 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13373 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
13374 return Rotate;
13376 // Use dedicated pack instructions for masks that match their pattern.
13377 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
13378 Subtarget))
13379 return V;
13381 // Try to use a zext lowering.
13382 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13383 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13384 return ZExt;
13386 // See if we can use SSE4A Extraction / Insertion.
13387 if (Subtarget.hasSSE4A())
13388 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
13389 Zeroable, DAG))
13390 return V;
13392 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
13394 // For single-input shuffles, there are some nicer lowering tricks we can use.
13395 if (NumV2Elements == 0) {
13396 // Check for being able to broadcast a single element.
13397 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13398 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
13399 return Broadcast;
13401 if (SDValue V =
13402 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
13403 return V;
13405 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
13406 // Notably, this handles splat and partial-splat shuffles more efficiently.
13407 // However, it only makes sense if the pre-duplication shuffle simplifies
13408 // things significantly. Currently, this means we need to be able to
13409 // express the pre-duplication shuffle as an i16 shuffle.
13411 // FIXME: We should check for other patterns which can be widened into an
13412 // i16 shuffle as well.
13413 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
13414 for (int i = 0; i < 16; i += 2)
13415 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
13416 return false;
13418 return true;
13419 };
13420 auto tryToWidenViaDuplication = [&]() -> SDValue {
13421 if (!canWidenViaDuplication(Mask))
13422 return SDValue();
13423 SmallVector<int, 4> LoInputs;
13424 copy_if(Mask, std::back_inserter(LoInputs),
13425 [](int M) { return M >= 0 && M < 8; });
13426 array_pod_sort(LoInputs.begin(), LoInputs.end());
13427 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
13428 LoInputs.end());
13429 SmallVector<int, 4> HiInputs;
13430 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
13431 array_pod_sort(HiInputs.begin(), HiInputs.end());
13432 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
13433 HiInputs.end());
13435 bool TargetLo = LoInputs.size() >= HiInputs.size();
13436 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
13437 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
13439 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
13440 SmallDenseMap<int, int, 8> LaneMap;
13441 for (int I : InPlaceInputs) {
13442 PreDupI16Shuffle[I/2] = I/2;
13443 LaneMap[I] = I;
13444 }
13445 int j = TargetLo ? 0 : 4, je = j + 4;
13446 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
13447 // Check if j is already a shuffle of this input. This happens when
13448 // there are two adjacent bytes after we move the low one.
13449 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
13450 // If we haven't yet mapped the input, search for a slot into which
13452 while (j < je && PreDupI16Shuffle[j] >= 0)
13453 ++j;
13455 if (j == je)
13456 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
13457 return SDValue();
13459 // Map this input with the i16 shuffle.
13460 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
13463 // Update the lane map based on the mapping we ended up with.
13464 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
13466 V1 = DAG.getBitcast(
13467 MVT::v16i8,
13468 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
13469 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
13471 // Unpack the bytes to form the i16s that will be shuffled into place.
13472 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13473 MVT::v16i8, V1, V1);
13475 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
13476 for (int i = 0; i < 16; ++i)
13477 if (Mask[i] >= 0) {
13478 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
13479 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
13480 if (PostDupI16Shuffle[i / 2] < 0)
13481 PostDupI16Shuffle[i / 2] = MappedMask;
13482 else
13483 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
13484 "Conflicting entries in the original shuffle!");
13485 }
13486 return DAG.getBitcast(
13487 MVT::v16i8,
13488 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
13489 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
13490 };
13491 if (SDValue V = tryToWidenViaDuplication())
13492 return V;
13493 }
13495 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
13496 Zeroable, DAG))
13497 return Masked;
13499 // Use dedicated unpack instructions for masks that match their pattern.
13500 if (SDValue V =
13501 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
13502 return V;
13504 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
13505 // with PSHUFB. It is important to do this before we attempt to generate any
13506 // blends but after all of the single-input lowerings. If the single input
13507 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
13508 // want to preserve that and we can DAG combine any longer sequences into
13509 // a PSHUFB in the end. But once we start blending from multiple inputs,
13510 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
13511 // and there are *very* few patterns that would actually be faster than the
13512 // PSHUFB approach because of its ability to zero lanes.
13514 // FIXME: The only exceptions to the above are blends which are exact
13515 // interleavings with direct instructions supporting them. We currently don't
13516 // handle those well here.
13517 if (Subtarget.hasSSSE3()) {
13518 bool V1InUse = false;
13519 bool V2InUse = false;
13521 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
13522 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
13524 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
13525 // do so. This avoids using them to handle blends-with-zero which is
13526 // important as a single pshufb is significantly faster for that.
13527 if (V1InUse && V2InUse) {
13528 if (Subtarget.hasSSE41())
13529 if (SDValue Blend = lowerVectorShuffleAsBlend(
13530 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13531 return Blend;
13533 // We can use an unpack to do the blending rather than an or in some
13534 // cases. Even though the or may be (very minorly) more efficient, we
13535 // preference this lowering because there are common cases where part of
13536 // the complexity of the shuffles goes away when we do the final blend as
13537 // an unpack.
13538 // FIXME: It might be worth trying to detect if the unpack-feeding
13539 // shuffles will both be pshufb, in which case we shouldn't bother with
13540 // this.
13541 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
13542 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
13543 return Unpack;
13545 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
13546 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
13547 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
13549 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
13550 // PALIGNR will be cheaper than the second PSHUFB+OR.
13551 if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute(
13552 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
13553 return V;
13554 }
13556 return PSHUFB;
13557 }
13559 // There are special ways we can lower some single-element blends.
13560 if (NumV2Elements == 1)
13561 if (SDValue V = lowerVectorShuffleAsElementInsertion(
13562 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13563 return V;
13565 if (SDValue BitBlend =
13566 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
13567 return BitBlend;
13569 // Check whether a compaction lowering can be done. This handles shuffles
13570 // which take every Nth element for some even N. See the helper function for
13571 // details.
13573 // We special case these as they can be particularly efficiently handled with
13574 // the PACKUSWB instruction on x86 and they show up in common patterns of
13575 // rearranging bytes to truncate wide elements.
13576 bool IsSingleInput = V2.isUndef();
13577 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
13578 // NumEvenDrops is the power of two stride of the elements. Another way of
13579 // thinking about it is that we need to drop the even elements this many
13580 // times to get the original input.
13582 // First we need to zero all the dropped bytes.
13583 assert(NumEvenDrops <= 3 &&
13584 "No support for dropping even elements more than 3 times.");
13585 // We use the mask type to pick which bytes are preserved based on how many
13586 // elements are dropped.
13587 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
13588 SDValue ByteClearMask = DAG.getBitcast(
13589 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
13590 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
13591 if (!IsSingleInput)
13592 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
13594 // Now pack things back together.
13595 V1 = DAG.getBitcast(MVT::v8i16, V1);
13596 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
13597 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
13598 for (int i = 1; i < NumEvenDrops; ++i) {
13599 Result = DAG.getBitcast(MVT::v8i16, Result);
13600 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
13601 }
13603 return Result;
13604 }
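// Why the PACKUS sequence above is safe: after the AND every 16-bit lane
// holds a byte value with its high byte cleared, so the unsigned saturation
// in PACKUS never triggers and each step simply drops the zeroed high byte of
// every word, compacting V1's surviving bytes into the low half of the result
// and V2's into the high half.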
13606 // Handle multi-input cases by blending single-input shuffles.
13607 if (NumV2Elements > 0)
13608 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
13609 Mask, Subtarget, DAG);
13611 // The fallback path for single-input shuffles widens this into two v8i16
13612 // vectors with unpacks, shuffles those, and then pulls them back together
13613 // with a pack.
13614 SDValue V = V1;
13616 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
13617 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
13618 for (int i = 0; i < 16; ++i)
13619 if (Mask[i] >= 0)
13620 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
13622 SDValue VLoHalf, VHiHalf;
13623 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
13624 // them out and avoid using UNPCK{L,H} to extract the elements of V as
13626 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
13627 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
13628 // Use a mask to drop the high bytes.
13629 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
13630 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
13631 DAG.getConstant(0x00FF, DL, MVT::v8i16));
13633 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
13634 VHiHalf = DAG.getUNDEF(MVT::v8i16);
13636 // Squash the masks to point directly into VLoHalf.
13637 for (int &M : LoBlendMask)
13638 if (M >= 0)
13639 M /= 2;
13640 for (int &M : HiBlendMask)
13641 if (M >= 0)
13642 M /= 2;
13643 } else {
13644 // Otherwise just unpack the low half of V into VLoHalf and the high half into
13645 // VHiHalf so that we can blend them as i16s.
13646 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
13648 VLoHalf = DAG.getBitcast(
13649 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
13650 VHiHalf = DAG.getBitcast(
13651 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
13654 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
13655 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
13657 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
13658 }
13660 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
13662 /// This routine breaks down the specific type of 128-bit shuffle and
13663 /// dispatches to the lowering routines accordingly.
13664 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13665 MVT VT, SDValue V1, SDValue V2,
13666 const APInt &Zeroable,
13667 const X86Subtarget &Subtarget,
13668 SelectionDAG &DAG) {
13669 switch (VT.SimpleTy) {
13670 case MVT::v2i64:
13671 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13672 case MVT::v2f64:
13673 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13674 case MVT::v4i32:
13675 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13676 case MVT::v4f32:
13677 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13678 case MVT::v8i16:
13679 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13680 case MVT::v16i8:
13681 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13683 default:
13684 llvm_unreachable("Unimplemented!");
13685 }
13686 }
13688 /// Generic routine to split vector shuffle into half-sized shuffles.
13690 /// This routine just extracts two subvectors, shuffles them independently, and
13691 /// then concatenates them back together. This should work effectively with all
13692 /// AVX vector shuffle types.
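/// For example, a v8i32 shuffle with mask <0, 8, 1, 9, 6, 14, 7, 15> becomes
/// a v4i32 shuffle of the low halves with mask <0, 4, 1, 5>, a v4i32 shuffle
/// of the high halves with mask <2, 6, 3, 7>, and a CONCAT_VECTORS of the two
/// results.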
13693 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
13694 SDValue V2, ArrayRef<int> Mask,
13695 SelectionDAG &DAG) {
13696 assert(VT.getSizeInBits() >= 256 &&
13697 "Only for 256-bit or wider vector shuffles!");
13698 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
13699 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
13701 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
13702 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
13704 int NumElements = VT.getVectorNumElements();
13705 int SplitNumElements = NumElements / 2;
13706 MVT ScalarVT = VT.getVectorElementType();
13707 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
13709 // Rather than splitting build-vectors, just build two narrower build
13710 // vectors. This helps shuffling with splats and zeros.
13711 auto SplitVector = [&](SDValue V) {
13712 V = peekThroughBitcasts(V);
13714 MVT OrigVT = V.getSimpleValueType();
13715 int OrigNumElements = OrigVT.getVectorNumElements();
13716 int OrigSplitNumElements = OrigNumElements / 2;
13717 MVT OrigScalarVT = OrigVT.getVectorElementType();
13718 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
13720 SDValue LoV, HiV;
13722 auto *BV = dyn_cast<BuildVectorSDNode>(V);
13723 if (!BV) {
13724 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
13725 DAG.getIntPtrConstant(0, DL));
13726 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
13727 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
13728 } else {
13730 SmallVector<SDValue, 16> LoOps, HiOps;
13731 for (int i = 0; i < OrigSplitNumElements; ++i) {
13732 LoOps.push_back(BV->getOperand(i));
13733 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
13735 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
13736 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
13738 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
13739 DAG.getBitcast(SplitVT, HiV));
13740 };
13742 SDValue LoV1, HiV1, LoV2, HiV2;
13743 std::tie(LoV1, HiV1) = SplitVector(V1);
13744 std::tie(LoV2, HiV2) = SplitVector(V2);
13746 // Now create two 4-way blends of these half-width vectors.
13747 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
13748 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
13749 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
13750 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
13751 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
13752 for (int i = 0; i < SplitNumElements; ++i) {
13753 int M = HalfMask[i];
13754 if (M >= NumElements) {
13755 if (M >= NumElements + SplitNumElements)
13756 UseHiV2 = true;
13757 else
13758 UseLoV2 = true;
13759 V2BlendMask[i] = M - NumElements;
13760 BlendMask[i] = SplitNumElements + i;
13761 } else if (M >= 0) {
13762 if (M >= SplitNumElements)
13763 UseHiV1 = true;
13764 else
13765 UseLoV1 = true;
13766 V1BlendMask[i] = M;
13767 BlendMask[i] = i;
13768 }
13769 }
13771 // Because the lowering happens after all combining takes place, we need to
13772 // manually combine these blend masks as much as possible so that we create
13773 // a minimal number of high-level vector shuffle nodes.
13775 // First try just blending the halves of V1 or V2.
13776 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
13777 return DAG.getUNDEF(SplitVT);
13778 if (!UseLoV2 && !UseHiV2)
13779 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
13780 if (!UseLoV1 && !UseHiV1)
13781 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
13783 SDValue V1Blend, V2Blend;
13784 if (UseLoV1 && UseHiV1) {
13785 V1Blend =
13786 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
13787 } else {
13788 // We only use half of V1 so map the usage down into the final blend mask.
13789 V1Blend = UseLoV1 ? LoV1 : HiV1;
13790 for (int i = 0; i < SplitNumElements; ++i)
13791 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
13792 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
13793 }
13794 if (UseLoV2 && UseHiV2) {
13795 V2Blend =
13796 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
13797 } else {
13798 // We only use half of V2 so map the usage down into the final blend mask.
13799 V2Blend = UseLoV2 ? LoV2 : HiV2;
13800 for (int i = 0; i < SplitNumElements; ++i)
13801 if (BlendMask[i] >= SplitNumElements)
13802 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
13803 }
13804 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
13805 };
13806 SDValue Lo = HalfBlend(LoMask);
13807 SDValue Hi = HalfBlend(HiMask);
13808 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13809 }
13811 /// Either split a vector in halves or decompose the shuffles and the
13814 /// This is provided as a good fallback for many lowerings of non-single-input
13815 /// shuffles with more than one 128-bit lane. In those cases, we want to select
13816 /// between splitting the shuffle into 128-bit components and stitching those
13817 /// back together vs. extracting the single-input shuffles and blending those
13819 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
13820 SDValue V1, SDValue V2,
13821 ArrayRef<int> Mask,
13822 const X86Subtarget &Subtarget,
13823 SelectionDAG &DAG) {
13824 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
13825 "shuffles as it could then recurse on itself.");
13826 int Size = Mask.size();
13828 // If this can be modeled as a broadcast of two elements followed by a blend,
13829 // prefer that lowering. This is especially important because broadcasts can
13830 // often fold with memory operands.
13831 auto DoBothBroadcast = [&] {
13832 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
13833 for (int M : Mask)
13834 if (M >= Size) {
13835 if (V2BroadcastIdx < 0)
13836 V2BroadcastIdx = M - Size;
13837 else if (M - Size != V2BroadcastIdx)
13838 return false;
13839 } else if (M >= 0) {
13840 if (V1BroadcastIdx < 0)
13841 V1BroadcastIdx = M;
13842 else if (M != V1BroadcastIdx)
13843 return false;
13844 }
13845 return true;
13846 };
13847 if (DoBothBroadcast())
13848 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
13849 Subtarget, DAG);
13851 // If the inputs all stem from a single 128-bit lane of each input, then we
13852 // split them rather than blending because the split will decompose to
13853 // unusually few instructions.
13854 int LaneCount = VT.getSizeInBits() / 128;
13855 int LaneSize = Size / LaneCount;
13856 SmallBitVector LaneInputs[2];
13857 LaneInputs[0].resize(LaneCount, false);
13858 LaneInputs[1].resize(LaneCount, false);
13859 for (int i = 0; i < Size; ++i)
13861 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
13862 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
13863 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13865 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
13866 // that the decomposed single-input shuffles don't end up here.
13867 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
13871 /// Lower a vector shuffle crossing multiple 128-bit lanes as
13872 /// a lane permutation followed by a per-lane permutation.
13874 /// This is mainly for cases where we can have non-repeating permutes in each lane.
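/// e.g. the single-input v8f32 shuffle <7,6,5,4,3,2,1,0> becomes the lane
/// permute <4,5,6,7,0,1,2,3> followed by the per-lane permute
/// <3,2,1,0,7,6,5,4>.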
13877 /// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
13878 /// we should investigate merging them.
13879 static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
13880 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13881 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
13882 int NumElts = VT.getVectorNumElements();
13883 int NumLanes = VT.getSizeInBits() / 128;
13884 int NumEltsPerLane = NumElts / NumLanes;
13886 SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
13887 SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
13888 SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
13890 for (int i = 0; i != NumElts; ++i) {
13895 // Ensure that each lane comes from a single source lane.
13896 int SrcLane = M / NumEltsPerLane;
13897 int DstLane = i / NumEltsPerLane;
13898 if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
13900 SrcLaneMask[DstLane] = SrcLane;
13902 LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
13903 PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
13906 // If we're only shuffling a single lowest lane and the rest are identity
13907 // then don't bother.
13908 // TODO - isShuffleMaskInputInPlace could be extended to something like this.
13909 int NumIdentityLanes = 0;
13910 bool OnlyShuffleLowestLane = true;
13911 for (int i = 0; i != NumLanes; ++i) {
13912 if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
13913 i * NumEltsPerLane))
13914 NumIdentityLanes++;
13915 else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
13916 OnlyShuffleLowestLane = false;
13918 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
13921 SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
13922 return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
13925 /// Lower a vector shuffle crossing multiple 128-bit lanes as
13926 /// a permutation and blend of those lanes.
13928 /// This essentially blends the out-of-lane inputs to each lane into the lane
13929 /// from a permuted copy of the vector. This lowering strategy results in four
13930 /// instructions in the worst case for a single-input cross lane shuffle which
13931 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
13932 /// of. Special cases for each particular shuffle pattern should be handled
13933 /// prior to trying this lowering.
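/// e.g. the single-input v4f64 shuffle <3,1,1,3> can be lowered by flipping
/// the 128-bit lanes of V1 and then blending the original and flipped vectors
/// with the in-lane mask <5,1,7,3>.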
13934 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
13935 SDValue V1, SDValue V2,
13936 ArrayRef<int> Mask,
13938 const X86Subtarget &Subtarget) {
13939 // FIXME: This should probably be generalized for 512-bit vectors as well.
13940 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
13941 int Size = Mask.size();
13942 int LaneSize = Size / 2;
13944 // If there are only inputs from one 128-bit lane, splitting will in fact be
13945 // less expensive. The flags track whether the given lane contains an element
13946 // that crosses to another lane.
13947 if (!Subtarget.hasAVX2()) {
13948 bool LaneCrossing[2] = {false, false};
13949 for (int i = 0; i < Size; ++i)
13950 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
13951 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
13952 if (!LaneCrossing[0] || !LaneCrossing[1])
13953 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13955 bool LaneUsed[2] = {false, false};
13956 for (int i = 0; i < Size; ++i)
13958 LaneUsed[(Mask[i] / LaneSize)] = true;
13959 if (!LaneUsed[0] || !LaneUsed[1])
13960 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13963 assert(V2.isUndef() &&
13964 "This last part of this routine only works on single input shuffles");
13966 SmallVector<int, 32> FlippedBlendMask(Size);
13967 for (int i = 0; i < Size; ++i)
13968 FlippedBlendMask[i] =
13969 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
13971 : Mask[i] % LaneSize +
13972 (i / LaneSize) * LaneSize + Size);
13974 // Flip the vector, and blend the results which should now be in-lane.
13975 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
13976 SDValue Flipped = DAG.getBitcast(PVT, V1);
13977 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
13979 Flipped = DAG.getBitcast(VT, Flipped);
13980 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
13983 /// Handle lowering 2-lane 128-bit shuffles.
13984 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
13985 SDValue V2, ArrayRef<int> Mask,
13986 const APInt &Zeroable,
13987 const X86Subtarget &Subtarget,
13988 SelectionDAG &DAG) {
13989 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
13990 if (Subtarget.hasAVX2() && V2.isUndef())
13993 SmallVector<int, 4> WidenedMask;
13994 if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
13997 bool IsLowZero = (Zeroable & 0x3) == 0x3;
13998 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14000 // Try to use an insert into a zero vector.
14001 if (WidenedMask[0] == 0 && IsHighZero) {
14002 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14003 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14004 DAG.getIntPtrConstant(0, DL));
14005 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14006 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14007 DAG.getIntPtrConstant(0, DL));
14010 // TODO: If minimizing size and one of the inputs is a zero vector and the
14011 // zero vector has only one use, we could use a VPERM2X128 to save the
14012 // instruction bytes needed to explicitly generate the zero vector.
14014 // Blends are faster and handle all the non-lane-crossing cases.
14015 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
14016 Zeroable, Subtarget, DAG))
14019 // If either input operand is a zero vector, use VPERM2X128 because its mask
14020 // allows us to replace the zero input with an implicit zero.
14021 if (!IsLowZero && !IsHighZero) {
14022 // Check for patterns which can be matched with a single insert of a 128-bit subvector.
14024 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
14025 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
14027 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14028 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14029 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14030 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14031 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14032 OnlyUsesV1 ? V1 : V2,
14033 DAG.getIntPtrConstant(0, DL));
14034 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14035 DAG.getIntPtrConstant(2, DL));
14039 // Try to use SHUF128 if possible.
14040 if (Subtarget.hasVLX()) {
14041 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
14042 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
14043 ((WidenedMask[1] % 2) << 1);
14044 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
14045 DAG.getConstant(PermMask, DL, MVT::i8));
14050 // Otherwise form a 128-bit permutation. After accounting for undefs,
14051 // convert the 64-bit shuffle mask selection values into 128-bit
14052 // selection bits by dividing the indexes by 2 and shifting into positions
14053 // defined by a vperm2*128 instruction's immediate control byte.
14055 // The immediate permute control byte looks like this:
14056 // [1:0] - select 128 bits from sources for low half of destination
14057 // [2] - ignore
14058 // [3] - zero low half of destination
14059 // [5:4] - select 128 bits from sources for high half of destination
14060 // [6] - ignore
14061 // [7] - zero high half of destination
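//
// e.g. WidenedMask <1, 2> selects the high 128 bits of V1 for the low half and
// the low 128 bits of V2 for the high half, giving an immediate of 0x21.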
14063 assert((WidenedMask[0] >= 0 || IsLowZero) &&
14064 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
14066 unsigned PermMask = 0;
14067 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
14068 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
14070 // Check the immediate mask and replace unused sources with undef.
14071 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
14072 V1 = DAG.getUNDEF(VT);
14073 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
14074 V2 = DAG.getUNDEF(VT);
14076 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
14077 DAG.getConstant(PermMask, DL, MVT::i8));
14080 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
14081 /// shuffling each lane.
14083 /// This attempts to create a repeated lane shuffle where each lane uses one
14084 /// or two of the lanes of the inputs. The lanes of the input vectors are
14085 /// shuffled in one or two independent shuffles to get the lanes into the
14086 /// position needed by the final shuffle.
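///
/// e.g. the v8i32 shuffle <4,12,5,13,0,8,1,9> can be lowered by swapping the
/// 128-bit lanes of each input and then applying the repeated in-lane mask
/// <0,8,1,9> (an unpcklo pattern) to the two lane-swapped operands.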
14088 /// FIXME: This should be generalized to 512-bit shuffles.
14089 static SDValue lowerVectorShuffleByMerging128BitLanes(
14090 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14091 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14092 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
14094 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
14097 int Size = Mask.size();
14098 int LaneSize = 128 / VT.getScalarSizeInBits();
14099 int NumLanes = Size / LaneSize;
14100 assert(NumLanes == 2 && "Only handles 256-bit shuffles.");
14102 SmallVector<int, 16> RepeatMask(LaneSize, -1);
14103 int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } };
14105 // First pass will try to fill in the RepeatMask from lanes that need two sources.
14107 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14108 int Srcs[2] = { -1, -1 };
14109 SmallVector<int, 16> InLaneMask(LaneSize, -1);
14110 for (int i = 0; i != LaneSize; ++i) {
14111 int M = Mask[(Lane * LaneSize) + i];
14114 // Determine which of the 4 possible input lanes (2 from each source)
14115 // this element comes from. Assign that as one of the sources for this
14116 // lane. We can assign up to 2 sources for this lane. If we run out of
14117 // sources we can't do anything.
14118 int LaneSrc = M / LaneSize;
14120 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
14122 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
14127 Srcs[Src] = LaneSrc;
14128 InLaneMask[i] = (M % LaneSize) + Src * Size;
14131 // If this lane has two sources, see if it fits with the repeat mask so far.
14135 LaneSrcs[Lane][0] = Srcs[0];
14136 LaneSrcs[Lane][1] = Srcs[1];
14138 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
14139 assert(M1.size() == M2.size() && "Unexpected mask size");
14140 for (int i = 0, e = M1.size(); i != e; ++i)
14141 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
14146 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
14147 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
14148 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
14152 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
14153 "Unexpected mask element");
14158 if (MatchMasks(InLaneMask, RepeatMask)) {
14159 // Merge this lane mask into the final repeat mask.
14160 MergeMasks(InLaneMask, RepeatMask);
14164 // Didn't find a match. Swap the operands and try again.
14165 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
14166 ShuffleVectorSDNode::commuteMask(InLaneMask);
14168 if (MatchMasks(InLaneMask, RepeatMask)) {
14169 // Merge this lane mask into the final repeat mask.
14170 MergeMasks(InLaneMask, RepeatMask);
14174 // Couldn't find a match with the operands in either order.
14178 // Now handle any lanes with only one source.
14179 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14180 // If this lane has already been processed, skip it.
14181 if (LaneSrcs[Lane][0] >= 0)
14184 for (int i = 0; i != LaneSize; ++i) {
14185 int M = Mask[(Lane * LaneSize) + i];
14189 // If RepeatMask isn't defined yet we can define it ourselves.
14190 if (RepeatMask[i] < 0)
14191 RepeatMask[i] = M % LaneSize;
14193 if (RepeatMask[i] < Size) {
14194 if (RepeatMask[i] != M % LaneSize)
14196 LaneSrcs[Lane][0] = M / LaneSize;
14198 if (RepeatMask[i] != ((M % LaneSize) + Size))
14200 LaneSrcs[Lane][1] = M / LaneSize;
14204 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
14208 SmallVector<int, 16> NewMask(Size, -1);
14209 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14210 int Src = LaneSrcs[Lane][0];
14211 for (int i = 0; i != LaneSize; ++i) {
14214 M = Src * LaneSize + i;
14215 NewMask[Lane * LaneSize + i] = M;
14218 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14219 // Ensure we didn't get back the shuffle we started with.
14220 // FIXME: This is a hack to make up for some splat handling code in
14221 // getVectorShuffle.
14222 if (isa<ShuffleVectorSDNode>(NewV1) &&
14223 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
14226 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14227 int Src = LaneSrcs[Lane][1];
14228 for (int i = 0; i != LaneSize; ++i) {
14231 M = Src * LaneSize + i;
14232 NewMask[Lane * LaneSize + i] = M;
14235 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14236 // Ensure we didn't get back the shuffle we started with.
14237 // FIXME: This is a hack to make up for some splat handling code in
14238 // getVectorShuffle.
14239 if (isa<ShuffleVectorSDNode>(NewV2) &&
14240 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
14243 for (int i = 0; i != Size; ++i) {
14244 NewMask[i] = RepeatMask[i % LaneSize];
14245 if (NewMask[i] < 0)
14248 NewMask[i] += (i / LaneSize) * LaneSize;
14250 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
14253 /// Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
14254 /// This allows for fast cases such as subvector extraction/insertion
14255 /// or shuffling smaller vector types which can lower more efficiently.
14256 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
14257 SDValue V1, SDValue V2,
14258 ArrayRef<int> Mask,
14259 const X86Subtarget &Subtarget,
14260 SelectionDAG &DAG) {
14261 assert((VT.is256BitVector() || VT.is512BitVector()) &&
14262 "Expected 256-bit or 512-bit vector");
14264 unsigned NumElts = VT.getVectorNumElements();
14265 unsigned HalfNumElts = NumElts / 2;
14266 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
14268 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
14269 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
14270 if (!UndefLower && !UndefUpper)
14273 // Upper half is undef and lower half is whole upper subvector.
14274 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
14276 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
14277 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
14278 DAG.getIntPtrConstant(HalfNumElts, DL));
14279 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
14280 DAG.getIntPtrConstant(0, DL));
14283 // Lower half is undef and upper half is whole lower subvector.
14284 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
14286 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
14287 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
14288 DAG.getIntPtrConstant(0, DL));
14289 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
14290 DAG.getIntPtrConstant(HalfNumElts, DL));
14293 // If the shuffle only uses two of the four halves of the input operands,
14294 // then extract them and perform the 'half' shuffle at half width.
14295 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
14296 int HalfIdx1 = -1, HalfIdx2 = -1;
14297 SmallVector<int, 8> HalfMask(HalfNumElts);
14298 unsigned Offset = UndefLower ? HalfNumElts : 0;
14299 for (unsigned i = 0; i != HalfNumElts; ++i) {
14300 int M = Mask[i + Offset];
14306 // Determine which of the 4 half vectors this element is from.
14307 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
14308 int HalfIdx = M / HalfNumElts;
14310 // Determine the element index into its half vector source.
14311 int HalfElt = M % HalfNumElts;
14313 // We can shuffle with up to 2 half vectors, set the new 'half'
14314 // shuffle mask accordingly.
14315 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
14316 HalfMask[i] = HalfElt;
14317 HalfIdx1 = HalfIdx;
14320 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
14321 HalfMask[i] = HalfElt + HalfNumElts;
14322 HalfIdx2 = HalfIdx;
14326 // Too many half vectors referenced.
14329 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
14331 // Only shuffle the halves of the inputs when useful.
14332 int NumLowerHalves =
14333 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
14334 int NumUpperHalves =
14335 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
14337 // uuuuXXXX - don't extract uppers just to insert again.
14338 if (UndefLower && NumUpperHalves != 0)
14341 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
14342 if (UndefUpper && NumUpperHalves == 2)
14345 // AVX2 - XXXXuuuu - always extract lowers.
14346 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
14347 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
14348 if (VT == MVT::v4f64 || VT == MVT::v4i64)
14350 // AVX2 supports variable 32-bit element cross-lane shuffles.
14351 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
14352 // XXXXuuuu - don't extract lowers and uppers.
14353 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
14358 // AVX512 - XXXXuuuu - always extract lowers.
14359 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
14362 auto GetHalfVector = [&](int HalfIdx) {
14364 return DAG.getUNDEF(HalfVT);
14365 SDValue V = (HalfIdx < 2 ? V1 : V2);
14366 HalfIdx = (HalfIdx % 2) * HalfNumElts;
14367 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
14368 DAG.getIntPtrConstant(HalfIdx, DL));
14371 SDValue Half1 = GetHalfVector(HalfIdx1);
14372 SDValue Half2 = GetHalfVector(HalfIdx2);
14373 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
14374 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
14375 DAG.getIntPtrConstant(Offset, DL));
14378 /// Test whether the specified input (0 or 1) is in-place blended by the given mask.
14381 /// This returns true if the elements from a particular input are already in the
14382 /// slot required by the given mask and require no permutation.
14383 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14384 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14385 int Size = Mask.size();
14386 for (int i = 0; i < Size; ++i)
14387 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14393 /// Handle case where shuffle sources are coming from the same 128-bit lane and
14394 /// every lane can be represented as the same repeating mask - allowing us to
14395 /// shuffle the sources with the repeating shuffle and then permute the result
14396 /// to the destination lanes.
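///
/// e.g. with AVX2 the v8f32 shuffle <6,7,2,3,6,7,2,3> becomes the in-lane
/// repeated shuffle <2,3,u,u,6,7,u,u> followed by the 64-bit sub-lane permute
/// <4,5,0,1,4,5,0,1>.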
14397 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
14398 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14399 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14400 int NumElts = VT.getVectorNumElements();
14401 int NumLanes = VT.getSizeInBits() / 128;
14402 int NumLaneElts = NumElts / NumLanes;
14404 // On AVX2 we may be able to just shuffle the lowest elements and then
14405 // broadcast the result.
14406 if (Subtarget.hasAVX2()) {
14407 for (unsigned BroadcastSize : {16, 32, 64}) {
14408 if (BroadcastSize <= VT.getScalarSizeInBits())
14410 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
14412 // Attempt to match a repeating pattern every NumBroadcastElts,
14413 // accounting for UNDEFs but only referencing the lowest 128-bit
14414 // lane of the inputs.
14415 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
14416 for (int i = 0; i != NumElts; i += NumBroadcastElts)
14417 for (int j = 0; j != NumBroadcastElts; ++j) {
14418 int M = Mask[i + j];
14421 int &R = RepeatMask[j];
14422 if (0 != ((M % NumElts) / NumLaneElts))
14424 if (0 <= R && R != M)
14431 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
14432 if (!FindRepeatingBroadcastMask(RepeatMask))
14435 // Shuffle the (lowest) repeated elements in place for broadcast.
14436 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
14438 // Shuffle the actual broadcast.
14439 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
14440 for (int i = 0; i != NumElts; i += NumBroadcastElts)
14441 for (int j = 0; j != NumBroadcastElts; ++j)
14442 BroadcastMask[i + j] = j;
14443 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
14448 // Bail if the shuffle mask doesn't cross 128-bit lanes.
14449 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
14452 // Bail if we already have a repeated lane shuffle mask.
14453 SmallVector<int, 8> RepeatedShuffleMask;
14454 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
14457 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
14458 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
14459 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
14460 int NumSubLanes = NumLanes * SubLaneScale;
14461 int NumSubLaneElts = NumLaneElts / SubLaneScale;
14463 // Check that all the sources are coming from the same lane and see if we can
14464 // form a repeating shuffle mask (local to each sub-lane). At the same time,
14465 // determine the source sub-lane for each destination sub-lane.
14466 int TopSrcSubLane = -1;
14467 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
14468 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
14469 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
14470 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
14472 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
14473 // Extract the sub-lane mask, check that it all comes from the same lane
14474 // and normalize the mask entries to come from the first lane.
14476 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
14477 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
14478 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
14481 int Lane = (M % NumElts) / NumLaneElts;
14482 if ((0 <= SrcLane) && (SrcLane != Lane))
14485 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
14486 SubLaneMask[Elt] = LocalM;
14489 // Whole sub-lane is UNDEF.
14493 // Attempt to match against the candidate repeated sub-lane masks.
14494 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
14495 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
14496 for (int i = 0; i != NumSubLaneElts; ++i) {
14497 if (M1[i] < 0 || M2[i] < 0)
14499 if (M1[i] != M2[i])
14505 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
14506 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
14509 // Merge the sub-lane mask into the matching repeated sub-lane mask.
14510 for (int i = 0; i != NumSubLaneElts; ++i) {
14511 int M = SubLaneMask[i];
14514 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
14515 "Unexpected mask element");
14516 RepeatedSubLaneMask[i] = M;
14519 // Track the top most source sub-lane - by setting the remaining to UNDEF
14520 // we can greatly simplify shuffle matching.
14521 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
14522 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
14523 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
14527 // Bail if we failed to find a matching repeated sub-lane mask.
14528 if (Dst2SrcSubLanes[DstSubLane] < 0)
14531 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
14532 "Unexpected source lane");
14534 // Create a repeating shuffle mask for the entire vector.
14535 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
14536 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
14537 int Lane = SubLane / SubLaneScale;
14538 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
14539 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
14540 int M = RepeatedSubLaneMask[Elt];
14543 int Idx = (SubLane * NumSubLaneElts) + Elt;
14544 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
14547 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
14549 // Shuffle each source sub-lane to its destination.
14550 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
14551 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
14552 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
14553 if (SrcSubLane < 0)
14555 for (int j = 0; j != NumSubLaneElts; ++j)
14556 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
14559 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
14563 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
14564 unsigned &ShuffleImm,
14565 ArrayRef<int> Mask) {
14566 int NumElts = VT.getVectorNumElements();
14567 assert(VT.getScalarSizeInBits() == 64 &&
14568 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
14569 "Unexpected data type for VSHUFPD");
14571 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
14572 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
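//
// e.g. the v4f64 mask <0, 5, 2, 7> fits the SHUFPD pattern and produces
// ShuffleImm = 0b1010.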
14574 bool ShufpdMask = true;
14575 bool CommutableMask = true;
14576 for (int i = 0; i < NumElts; ++i) {
14577 if (Mask[i] == SM_SentinelUndef)
14581 int Val = (i & 6) + NumElts * (i & 1);
14582 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
14583 if (Mask[i] < Val || Mask[i] > Val + 1)
14584 ShufpdMask = false;
14585 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
14586 CommutableMask = false;
14587 ShuffleImm |= (Mask[i] % 2) << i;
14592 if (CommutableMask) {
14600 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
14601 ArrayRef<int> Mask, SDValue V1,
14602 SDValue V2, SelectionDAG &DAG) {
14603 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
14604 "Unexpected data type for VSHUFPD");
14606 unsigned Immediate = 0;
14607 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
14610 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14611 DAG.getConstant(Immediate, DL, MVT::i8));
14614 /// Handle lowering of 4-lane 64-bit floating point shuffles.
14616 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
14617 /// isn't available.
14618 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14619 const APInt &Zeroable,
14620 SDValue V1, SDValue V2,
14621 const X86Subtarget &Subtarget,
14622 SelectionDAG &DAG) {
14623 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
14624 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
14625 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14627 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
14628 Zeroable, Subtarget, DAG))
14631 if (V2.isUndef()) {
14632 // Check for being able to broadcast a single element.
14633 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
14634 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
14637 // Use low duplicate instructions for masks that match their pattern.
14638 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
14639 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
14641 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
14642 // Non-half-crossing single input shuffles can be lowered with an
14643 // interleaved permutation.
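// e.g. the mask <1, 0, 3, 2> yields VPERMILPMask = 0b0101.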
14644 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
14645 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
14646 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
14647 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
14650 // With AVX2 we have direct support for this permutation.
14651 if (Subtarget.hasAVX2())
14652 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
14653 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14655 // Try to create an in-lane repeating shuffle mask and then shuffle the
14656 // results into the target lanes.
14657 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14658 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
14661 // Try to permute the lanes and then use a per-lane permute.
14662 if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
14663 DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
14666 // Otherwise, fall back.
14667 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
14671 // Use dedicated unpack instructions for masks that match their pattern.
14673 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
14676 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
14677 Zeroable, Subtarget, DAG))
14680 // Check if the blend happens to exactly fit that of SHUFPD.
14682 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
14685 // Try to create an in-lane repeating shuffle mask and then shuffle the
14686 // results into the target lanes.
14687 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14688 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
14691 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14692 // shuffle. However, if we have AVX2 and either input is already in place,
14693 // we will be able to shuffle the other input even across lanes in a single
14694 // instruction, so skip this pattern.
14695 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
14696 isShuffleMaskInputInPlace(1, Mask))))
14697 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14698 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
14701 // If we have VLX support, we can use VEXPAND.
14702 if (Subtarget.hasVLX())
14703 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
14704 V1, V2, DAG, Subtarget))
14707 // If we have AVX2 then we always want to lower with a blend because at v4 we
14708 // can fully permute the elements.
14709 if (Subtarget.hasAVX2())
14710 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
14711 Mask, Subtarget, DAG);
14713 // Otherwise fall back on generic lowering.
14714 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
14718 /// Handle lowering of 4-lane 64-bit integer shuffles.
14720 /// This routine is only called when we have AVX2 and thus a reasonable
14721 /// instruction set for v4i64 shuffling.
14722 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14723 const APInt &Zeroable,
14724 SDValue V1, SDValue V2,
14725 const X86Subtarget &Subtarget,
14726 SelectionDAG &DAG) {
14727 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
14728 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
14729 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14730 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
14732 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
14733 Zeroable, Subtarget, DAG))
14736 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
14737 Zeroable, Subtarget, DAG))
14740 // Check for being able to broadcast a single element.
14741 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
14742 Mask, Subtarget, DAG))
14745 if (V2.isUndef()) {
14746 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
14747 // can use lower latency instructions that will operate on both lanes.
14748 SmallVector<int, 2> RepeatedMask;
14749 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
14750 SmallVector<int, 4> PSHUFDMask;
14751 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
14752 return DAG.getBitcast(
14754 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
14755 DAG.getBitcast(MVT::v8i32, V1),
14756 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14759 // AVX2 provides a direct instruction for permuting a single input across lanes.
14761 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
14762 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14765 // Try to use shift instructions.
14766 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
14767 Zeroable, Subtarget, DAG))
14770 // If we have VLX support, we can use VALIGN or VEXPAND.
14771 if (Subtarget.hasVLX()) {
14772 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
14773 Mask, Subtarget, DAG))
14776 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
14777 V1, V2, DAG, Subtarget))
14781 // Try to use PALIGNR.
14782 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
14783 Mask, Subtarget, DAG))
14786 // Use dedicated unpack instructions for masks that match their pattern.
14788 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
14791 // Try to create an in-lane repeating shuffle mask and then shuffle the
14792 // results into the target lanes.
14793 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14794 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
14797 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14798 // shuffle. However, if we have AVX2 and either input is already in place,
14799 // we will be able to shuffle the other input even across lanes in a single
14800 // instruction, so skip this pattern.
14801 if (!isShuffleMaskInputInPlace(0, Mask) &&
14802 !isShuffleMaskInputInPlace(1, Mask))
14803 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14804 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
14807 // Otherwise fall back on generic blend lowering.
14808 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
14809 Mask, Subtarget, DAG);
14812 /// Handle lowering of 8-lane 32-bit floating point shuffles.
14814 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
14815 /// isn't available.
14816 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14817 const APInt &Zeroable,
14818 SDValue V1, SDValue V2,
14819 const X86Subtarget &Subtarget,
14820 SelectionDAG &DAG) {
14821 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
14822 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
14823 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14825 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
14826 Zeroable, Subtarget, DAG))
14829 // Check for being able to broadcast a single element.
14830 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
14831 Mask, Subtarget, DAG))
14834 // If the shuffle mask is repeated in each 128-bit lane, we have many more
14835 // options to efficiently lower the shuffle.
14836 SmallVector<int, 4> RepeatedMask;
14837 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
14838 assert(RepeatedMask.size() == 4 &&
14839 "Repeated masks must be half the mask width!");
14841 // Use even/odd duplicate instructions for masks that match their pattern.
14842 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
14843 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
14844 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
14845 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
14848 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
14849 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14851 // Use dedicated unpack instructions for masks that match their pattern.
14853 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
14856 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
14857 // have already handled any direct blends.
14858 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
14861 // Try to create an in-lane repeating shuffle mask and then shuffle the
14862 // results into the target lanes.
14863 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14864 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
14867 // If we have a single-input shuffle with different shuffle patterns in the
14868 // two 128-bit lanes, use a variable-mask VPERMILPS.
14869 if (V2.isUndef()) {
14870 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
14871 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
14872 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
14874 if (Subtarget.hasAVX2())
14875 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
14877 // Otherwise, fall back.
14878 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
14882 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
14884 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14885 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
14887 // If we have VLX support, we can use VEXPAND.
14888 if (Subtarget.hasVLX())
14889 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
14890 V1, V2, DAG, Subtarget))
14893 // For non-AVX512 targets, if the mask matches an in-lane 16-bit unpack
14894 // pattern, try to split, since after the split we get more efficient code
14895 // using vpunpcklwd and vpunpckhwd instructions than with vblend.
14896 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
14897 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
14898 Mask, Subtarget, DAG))
14901 // If we have AVX2 then we always want to lower with a blend because at v8 we
14902 // can fully permute the elements.
14903 if (Subtarget.hasAVX2())
14904 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
14905 Mask, Subtarget, DAG);
14907 // Otherwise fall back on generic lowering.
14908 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
14912 /// Handle lowering of 8-lane 32-bit integer shuffles.
14914 /// This routine is only called when we have AVX2 and thus a reasonable
14915 /// instruction set for v8i32 shuffling.
14916 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14917 const APInt &Zeroable,
14918 SDValue V1, SDValue V2,
14919 const X86Subtarget &Subtarget,
14920 SelectionDAG &DAG) {
14921 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
14922 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
14923 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14924 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
14926 // Whenever we can lower this as a zext, that instruction is strictly faster
14927 // than any alternative. It also allows us to fold memory operands into the
14928 // shuffle in many cases.
14929 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14930 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14933 // For non-AVX512 targets, if the mask matches an in-lane 16-bit unpack
14934 // pattern, try to split, since after the split we get more efficient code
14935 // using vpunpcklwd and vpunpckhwd instructions than with vblend.
14936 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
14937 !Subtarget.hasAVX512())
14938 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2,
14939 Mask, Subtarget, DAG))
14942 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
14943 Zeroable, Subtarget, DAG))
14946 // Check for being able to broadcast a single element.
14947 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
14948 Mask, Subtarget, DAG))
14951 // If the shuffle mask is repeated in each 128-bit lane we can use more
14952 // efficient instructions that mirror the shuffles across the two 128-bit lanes.
14954 SmallVector<int, 4> RepeatedMask;
14955 bool Is128BitLaneRepeatedShuffle =
14956 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
14957 if (Is128BitLaneRepeatedShuffle) {
14958 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14960 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
14961 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14963 // Use dedicated unpack instructions for masks that match their pattern.
14965 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
14969 // Try to use shift instructions.
14970 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
14971 Zeroable, Subtarget, DAG))
14974 // If we have VLX support, we can use VALIGN or VEXPAND.
14975 if (Subtarget.hasVLX()) {
14976 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
14977 Mask, Subtarget, DAG))
14980 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
14981 V1, V2, DAG, Subtarget))
14985 // Try to use byte rotation instructions.
14986 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14987 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14990 // Try to create an in-lane repeating shuffle mask and then shuffle the
14991 // results into the target lanes.
14992 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14993 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14996 // If the shuffle patterns aren't repeated but it is a single input, directly
14997 // generate a cross-lane VPERMD instruction.
14998 if (V2.isUndef()) {
14999 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
15000 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
15003 // Assume that a single SHUFPS is faster than an alternative sequence of
15004 // multiple instructions (even if the CPU has a domain penalty).
15005 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15006 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
15007 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
15008 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
15009 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
15010 CastV1, CastV2, DAG);
15011 return DAG.getBitcast(MVT::v8i32, ShufPS);
15014 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
15016 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
15017 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
15020 // Otherwise fall back on generic blend lowering.
15021 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
15022 Mask, Subtarget, DAG);
15025 /// Handle lowering of 16-lane 16-bit integer shuffles.
15027 /// This routine is only called when we have AVX2 and thus a reasonable
15028 /// instruction set for v16i16 shuffling.
15029 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15030 const APInt &Zeroable,
15031 SDValue V1, SDValue V2,
15032 const X86Subtarget &Subtarget,
15033 SelectionDAG &DAG) {
15034 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
15035 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
15036 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15037 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
15039 // Whenever we can lower this as a zext, that instruction is strictly faster
15040 // than any alternative. It also allows us to fold memory operands into the
15041 // shuffle in many cases.
15042 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
15043 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15046 // Check for being able to broadcast a single element.
15047 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
15048 Mask, Subtarget, DAG))
15051 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
15052 Zeroable, Subtarget, DAG))
15055 // Use dedicated unpack instructions for masks that match their pattern.
15057 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
15060 // Use dedicated pack instructions for masks that match their pattern.
15061 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
15065 // Try to use shift instructions.
15066 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
15067 Zeroable, Subtarget, DAG))
15070 // Try to use byte rotation instructions.
15071 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
15072 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
15075 // Try to create an in-lane repeating shuffle mask and then shuffle the
15076 // results into the target lanes.
15077 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15078 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
15081 if (V2.isUndef()) {
15082 // There are no generalized cross-lane shuffle operations available on i16 element types.
15084 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
15085 if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
15086 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
15089 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
15090 Mask, DAG, Subtarget);
15093 SmallVector<int, 8> RepeatedMask;
15094 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
15095 // As this is a single-input shuffle, the repeated mask should be
15096 // a strictly valid v8i16 mask that we can pass through to the v8i16
15097 // lowering to handle even the v16 case.
15098 return lowerV8I16GeneralSingleInputVectorShuffle(
15099 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
15103 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
15104 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
15107 // AVX512BWVL can lower to VPERMW.
15108 if (Subtarget.hasBWI() && Subtarget.hasVLX())
15109 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
15111 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
15113 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
15114 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
15117 // Try to permute the lanes and then use a per-lane permute.
15118 if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
15119 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
15122 // Otherwise fall back on generic lowering.
15123 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
15127 /// Handle lowering of 32-lane 8-bit integer shuffles.
15129 /// This routine is only called when we have AVX2 and thus a reasonable
15130 /// instruction set for v32i8 shuffling.
15131 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15132 const APInt &Zeroable,
15133 SDValue V1, SDValue V2,
15134 const X86Subtarget &Subtarget,
15135 SelectionDAG &DAG) {
15136 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
15137 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
15138 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
15139 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
15141 // Whenever we can lower this as a zext, that instruction is strictly faster
15142 // than any alternative. It also allows us to fold memory operands into the
15143 // shuffle in many cases.
15144 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
15145 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15148 // Check for being able to broadcast a single element.
15149 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
15150 Mask, Subtarget, DAG))
15153 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
15154 Zeroable, Subtarget, DAG))
15157 // Use dedicated unpack instructions for masks that match their pattern.
15159 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
15162 // Use dedicated pack instructions for masks that match their pattern.
15163 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
15167 // Try to use shift instructions.
15168 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
15169 Zeroable, Subtarget, DAG))
15172 // Try to use byte rotation instructions.
15173 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
15174 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
15177 // Try to create an in-lane repeating shuffle mask and then shuffle the
15178 // results into the target lanes.
15179 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15180 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
15183 // There are no generalized cross-lane shuffle operations available on i8 element types.
15185 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
15186 if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
15187 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
15190 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
15194 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
15195 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
15198 // AVX512VBMIVL can lower to VPERMB.
15199 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
15200 return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
15202 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
15204 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
15205 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
15208 // Try to permute the lanes and then use a per-lane permute.
15209 if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
15210 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
15213 // Otherwise fall back on generic lowering.
15214 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
15218 /// High-level routine to lower various 256-bit x86 vector shuffles.
15220 /// This routine either breaks down the specific type of a 256-bit x86 vector
15221 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
15222 /// together based on the available instructions.
15223 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15224 MVT VT, SDValue V1, SDValue V2,
15225 const APInt &Zeroable,
15226 const X86Subtarget &Subtarget,
15227 SelectionDAG &DAG) {
15228 // If we have a single input to the zero element, insert that into V1 if we
15229 // can do so cheaply.
15230 int NumElts = VT.getVectorNumElements();
15231 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
15233 if (NumV2Elements == 1 && Mask[0] >= NumElts)
15234 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
15235 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
15238 // Handle special cases where the lower or upper half is UNDEF.
15240 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
15243 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
15244 // can check for those subtargets here and avoid much of the subtarget
15245 // querying in the per-vector-type lowering routines. With AVX1 we have
15246 // essentially *zero* ability to manipulate a 256-bit vector with integer
15247 // types. Since we'll use floating point types there eventually, just
15248 // immediately cast everything to a float and operate entirely in that domain.
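// e.g. without AVX2, v8i32 shuffles are performed as v8f32 and v4i64 shuffles
// as v4f64.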
15249 if (VT.isInteger() && !Subtarget.hasAVX2()) {
15250 int ElementBits = VT.getScalarSizeInBits();
15251 if (ElementBits < 32) {
15252 // No floating-point type is available; if we can't use bit operations
15253 // for masking/blending, then decompose into 128-bit vectors.
15255 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
15257 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
15259 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
15262 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
15263 VT.getVectorNumElements());
15264 V1 = DAG.getBitcast(FpVT, V1);
15265 V2 = DAG.getBitcast(FpVT, V2);
15266 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
15269 switch (VT.SimpleTy) {
15271 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15273 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15275 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15277 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15279 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15281 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15284 llvm_unreachable("Not a valid 256-bit x86 vector type!");
15288 /// Try to lower a vector shuffle as 128-bit shuffles.
15289 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
15290 ArrayRef<int> Mask,
15291 const APInt &Zeroable,
15292 SDValue V1, SDValue V2,
15293 const X86Subtarget &Subtarget,
15294 SelectionDAG &DAG) {
15295 assert(VT.getScalarSizeInBits() == 64 &&
15296 "Unexpected element type size for 128bit shuffle.");
15298 // Handling a 256-bit vector here would require VLX, and
15299 // lowerV2X128VectorShuffle() is most probably the better solution for that case.
15300 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
15302 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
15303 SmallVector<int, 4> WidenedMask;
15304 if (!canWidenShuffleElements(Mask, WidenedMask))
15307 // Try to use an insert into a zero vector.
15308 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
15309 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
15310 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
15311 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
15312 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15313 DAG.getIntPtrConstant(0, DL));
15314 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15315 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15316 DAG.getIntPtrConstant(0, DL));
15319 // Check for patterns which can be matched with a single insert of a 256-bit subvector.
15321 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
15322 {0, 1, 2, 3, 0, 1, 2, 3});
15323 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
15324 {0, 1, 2, 3, 8, 9, 10, 11})) {
15325 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
15326 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
15327 OnlyUsesV1 ? V1 : V2,
15328 DAG.getIntPtrConstant(0, DL));
15329 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15330 DAG.getIntPtrConstant(4, DL));
15333 assert(WidenedMask.size() == 4);
15335 // See if this is an insertion of the lower 128-bits of V2 into V1.
15336 bool IsInsert = true;
15338 for (int i = 0; i < 4; ++i) {
15339 assert(WidenedMask[i] >= -1);
15340 if (WidenedMask[i] < 0)
15343 // Make sure all V1 subvectors are in place.
15344 if (WidenedMask[i] < 4) {
15345 if (WidenedMask[i] != i) {
15350 // Make sure we only have a single V2 index and it's the lowest 128 bits.
15351 if (V2Index >= 0 || WidenedMask[i] != 4) {
15358 if (IsInsert && V2Index >= 0) {
15359 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15360 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
15361 DAG.getIntPtrConstant(0, DL));
15362 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
15365 // Try to lower to vshuf64x2/vshuf32x4.
15366 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
15367 unsigned PermMask = 0;
15368 // Ensure elements came from the same Op.
15369 for (int i = 0; i < 4; ++i) {
15370 assert(WidenedMask[i] >= -1);
15371 if (WidenedMask[i] < 0)
15374 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
15375 unsigned OpIndex = i / 2;
15376 if (Ops[OpIndex].isUndef())
15378 else if (Ops[OpIndex] != Op)
15381 // Convert the 128-bit shuffle mask selection values into 128-bit selection
15382 // bits defined by a vshuf64x2 instruction's immediate control byte.
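// e.g. WidenedMask <2, 3, 6, 7> selects the upper 256 bits of V1 and of V2,
// giving PermMask = 0xEE.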
15383 PermMask |= (WidenedMask[i] % 4) << (i * 2);
15386 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
15387 DAG.getConstant(PermMask, DL, MVT::i8));
15390 /// Handle lowering of 8-lane 64-bit floating point shuffles.
15391 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15392 const APInt &Zeroable,
15393 SDValue V1, SDValue V2,
15394 const X86Subtarget &Subtarget,
15395 SelectionDAG &DAG) {
15396 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
15397 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
15398 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15400 if (V2.isUndef()) {
15401 // Use low duplicate instructions for masks that match their pattern.
15402 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
15403 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
15405 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
15406 // Non-half-crossing single input shuffles can be lowered with an
15407 // interleaved permutation.
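// Each immediate bit selects the low or high element within one 128-bit pair;
// e.g. the odd-element duplicate mask <1,1,3,3,5,5,7,7> yields the immediate 0xff.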
15408 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15409 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
15410 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
15411 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
15412 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
15413 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
15416 SmallVector<int, 4> RepeatedMask;
15417 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
15418 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
15419 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15422 if (SDValue Shuf128 =
15423 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
15427 if (SDValue Unpck =
15428 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
15431 // Check if the blend happens to exactly fit that of SHUFPD.
15433 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
15436 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
15437 V2, DAG, Subtarget))
15440 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
15441 Zeroable, Subtarget, DAG))
15444 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
15447 /// Handle lowering of 16-lane 32-bit floating point shuffles.
15448 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15449 const APInt &Zeroable,
15450 SDValue V1, SDValue V2,
15451 const X86Subtarget &Subtarget,
15452 SelectionDAG &DAG) {
15453 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
15454 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
15455 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15457 // If the shuffle mask is repeated in each 128-bit lane, we have many more
15458 // options to efficiently lower the shuffle.
15459 SmallVector<int, 4> RepeatedMask;
15460 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
15461 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
15463 // Use even/odd duplicate instructions for masks that match their pattern.
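// (MOVSLDUP duplicates the even-index f32 elements, MOVSHDUP the odd-index ones.)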
15464 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
15465 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
15466 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
15467 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
15470 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
15471 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15473 // Use dedicated unpack instructions for masks that match their pattern.
15474 if (SDValue Unpck =
15475 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
15478 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
15479 Zeroable, Subtarget, DAG))
15482 // Otherwise, fall back to a SHUFPS sequence.
15483 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
15486 // If we have a single input shuffle with different shuffle patterns in the
15487 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
15488 if (V2.isUndef() &&
15489 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
15490 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
15491 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
15494 // If we have AVX512F support, we can use VEXPAND.
15495 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
15496 V1, V2, DAG, Subtarget))
15499 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
15502 /// Handle lowering of 8-lane 64-bit integer shuffles.
15503 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15504 const APInt &Zeroable,
15505 SDValue V1, SDValue V2,
15506 const X86Subtarget &Subtarget,
15507 SelectionDAG &DAG) {
15508 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
15509 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
15510 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15512 if (V2.isUndef()) {
15513 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
15514 // can use lower-latency instructions that operate on all four 128-bit lanes.
15516 SmallVector<int, 2> Repeated128Mask;
15517 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
15518 SmallVector<int, 4> PSHUFDMask;
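// e.g. Repeated128Mask = <1,0> scales to PSHUFDMask = <2,3,0,1>, swapping the
// two qwords within every 128-bit lane.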
15519 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
15520 return DAG.getBitcast(
15522 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
15523 DAG.getBitcast(MVT::v16i32, V1),
15524 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15527 SmallVector<int, 4> Repeated256Mask;
15528 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
15529 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
15530 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
15533 if (SDValue Shuf128 =
15534 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
15535 V1, V2, Subtarget, DAG))
15538 // Try to use shift instructions.
15539 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
15540 Zeroable, Subtarget, DAG))
15543 // Try to use VALIGN.
15544 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
15545 Mask, Subtarget, DAG))
15548 // Try to use PALIGNR.
15549 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
15550 Mask, Subtarget, DAG))
15553 if (SDValue Unpck =
15554 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
15556 // If we have AVX512F support, we can use VEXPAND.
15557 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
15558 V2, DAG, Subtarget))
15561 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
15562 Zeroable, Subtarget, DAG))
15565 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
15568 /// Handle lowering of 16-lane 32-bit integer shuffles.
15569 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15570 const APInt &Zeroable,
15571 SDValue V1, SDValue V2,
15572 const X86Subtarget &Subtarget,
15573 SelectionDAG &DAG) {
15574 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
15575 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
15576 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15578 // Whenever we can lower this as a zext, that instruction is strictly faster
15579 // than any alternative. It also allows us to fold memory operands into the
15580 // shuffle in many cases.
15581 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
15582 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15585 // If the shuffle mask is repeated in each 128-bit lane we can use more
15586 // efficient instructions that mirror the shuffles across the four 128-bit lanes.
15588 SmallVector<int, 4> RepeatedMask;
15589 bool Is128BitLaneRepeatedShuffle =
15590 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
15591 if (Is128BitLaneRepeatedShuffle) {
15592 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
15594 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
15595 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15597 // Use dedicated unpack instructions for masks that match their pattern.
15599 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
15603 // Try to use shift instructions.
15604 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
15605 Zeroable, Subtarget, DAG))
15608 // Try to use VALIGN.
15609 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
15610 Mask, Subtarget, DAG))
15613 // Try to use byte rotation instructions.
15614 if (Subtarget.hasBWI())
15615 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
15616 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
15619 // Assume that a single SHUFPS is faster than using a permv shuffle.
15620 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
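// (Bitcasting to the FP domain and back can incur a bypass delay on CPUs that
// distinguish integer and floating-point vector domains.)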
15621 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
15622 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
15623 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
15624 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
15625 CastV1, CastV2, DAG);
15626 return DAG.getBitcast(MVT::v16i32, ShufPS);
15628 // If we have AVX512F support, we can use VEXPAND.
15629 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
15630 V1, V2, DAG, Subtarget))
15633 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
15634 Zeroable, Subtarget, DAG))
15636 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
15639 /// Handle lowering of 32-lane 16-bit integer shuffles.
15640 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15641 const APInt &Zeroable,
15642 SDValue V1, SDValue V2,
15643 const X86Subtarget &Subtarget,
15644 SelectionDAG &DAG) {
15645 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
15646 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
15647 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
15648 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
15650 // Whenever we can lower this as a zext, that instruction is strictly faster
15651 // than any alternative. It also allows us to fold memory operands into the
15652 // shuffle in many cases.
15653 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
15654 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15657 // Use dedicated unpack instructions for masks that match their pattern.
15659 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
15662 // Try to use shift instructions.
15663 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
15664 Zeroable, Subtarget, DAG))
15667 // Try to use byte rotation instructions.
15668 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
15669 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
15672 if (V2.isUndef()) {
15673 SmallVector<int, 8> RepeatedMask;
15674 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
15675 // As this is a single-input shuffle, the repeated mask should be
15676 // a strictly valid v8i16 mask that we can pass through to the v8i16
15677 // lowering to handle even the v32 case.
15678 return lowerV8I16GeneralSingleInputVectorShuffle(
15679 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
15683 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
15684 Zeroable, Subtarget, DAG))
15687 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
15688 DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
15691 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
15694 /// Handle lowering of 64-lane 8-bit integer shuffles.
15695 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15696 const APInt &Zeroable,
15697 SDValue V1, SDValue V2,
15698 const X86Subtarget &Subtarget,
15699 SelectionDAG &DAG) {
15700 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
15701 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
15702 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
15703 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
15705 // Whenever we can lower this as a zext, that instruction is strictly faster
15706 // than any alternative. It also allows us to fold memory operands into the
15707 // shuffle in many cases.
15708 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
15709 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15712 // Use dedicated unpack instructions for masks that match their pattern.
15714 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
15717 // Use dedicated pack instructions for masks that match their pattern.
15718 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
15722 // Try to use shift instructions.
15723 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
15724 Zeroable, Subtarget, DAG))
15727 // Try to use byte rotation instructions.
15728 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
15729 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
15732 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
15733 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
15736 // VBMI can use VPERMV/VPERMV3 byte shuffles.
15737 if (Subtarget.hasVBMI())
15738 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
15740 // Try to create an in-lane repeating shuffle mask and then shuffle the
15741 // results into the target lanes.
15742 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15743 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
15746 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
15747 Zeroable, Subtarget, DAG))
15750 // FIXME: Implement direct support for this type!
15751 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
15754 /// High-level routine to lower various 512-bit x86 vector shuffles.
15756 /// This routine either breaks down the specific type of a 512-bit x86 vector
15757 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
15758 /// together based on the available instructions.
15759 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15760 MVT VT, SDValue V1, SDValue V2,
15761 const APInt &Zeroable,
15762 const X86Subtarget &Subtarget,
15763 SelectionDAG &DAG) {
15764 assert(Subtarget.hasAVX512() &&
15765 "Cannot lower 512-bit vectors w/ basic ISA!");
15767 // If we have a single input to the zero element, insert that into V1 if we
15768 // can do so cheaply.
15769 int NumElts = Mask.size();
15770 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
15772 if (NumV2Elements == 1 && Mask[0] >= NumElts)
15773 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
15774 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
15777 // Handle special cases where the lower or upper half is UNDEF.
15779 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
15782 // Check for being able to broadcast a single element.
15783 if (SDValue Broadcast =
15784 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
15787 // Dispatch to each element type for lowering. If we don't have support for
15788 // specific element type shuffles at 512 bits, immediately split them and
15789 // lower them. Each lowering routine of a given type is allowed to assume that
15790 // the requisite ISA extensions for that element type are available.
15791 switch (VT.SimpleTy) {
15793 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15795 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15797 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15799 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15801 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15803 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15806 llvm_unreachable("Not a valid 512-bit x86 vector type!");
15810 // Determine if this shuffle can be implemented with a KSHIFT instruction.
15811 // Returns the shift amount if possible or -1 if not. This is a simplified
15812 // version of matchVectorShuffleAsShift.
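// For example, if the top two elements are zeroable, the mask <2,3,4,5,6,7,-1,-1>
// matches a KSHIFTR with a shift amount of 2.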
15813 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
15814 int MaskOffset, const APInt &Zeroable) {
15815 int Size = Mask.size();
15817 auto CheckZeros = [&](int Shift, bool Left) {
15818 for (int j = 0; j < Shift; ++j)
15819 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
15825 auto MatchShift = [&](int Shift, bool Left) {
15826 unsigned Pos = Left ? Shift : 0;
15827 unsigned Low = Left ? 0 : Shift;
15828 unsigned Len = Size - Shift;
15829 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
15832 for (int Shift = 1; Shift != Size; ++Shift)
15833 for (bool Left : {true, false})
15834 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
15835 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
15843 // Lower vXi1 vector shuffles.
15844 // There is no dedicated instruction on AVX-512 that shuffles the masks.
15845 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
15846 // vector, shuffle, and then truncate it back.
15847 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15848 MVT VT, SDValue V1, SDValue V2,
15849 const APInt &Zeroable,
15850 const X86Subtarget &Subtarget,
15851 SelectionDAG &DAG) {
15852 assert(Subtarget.hasAVX512() &&
15853 "Cannot lower 512-bit vectors w/o basic ISA!");
15855 unsigned NumElts = Mask.size();
15857 // Try to recognize shuffles that are just padding a subvector with zeros.
15858 unsigned SubvecElts = 0;
15859 for (int i = 0; i != (int)NumElts; ++i) {
15860 if (Mask[i] >= 0 && Mask[i] != i)
15865 assert(SubvecElts != NumElts && "Identity shuffle?");
15867 // Clip to a power of 2.
15868 SubvecElts = PowerOf2Floor(SubvecElts);
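// (MVT only provides power-of-two vXi1 types, so this keeps ExtractVT below valid.)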
15870 // Make sure the number of zeroable bits in the top at least covers the bits
15871 // not covered by the subvector.
15872 if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
15873 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
15874 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
15875 V1, DAG.getIntPtrConstant(0, DL));
15876 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15877 getZeroVector(VT, Subtarget, DAG, DL),
15878 Extract, DAG.getIntPtrConstant(0, DL));
15881 // Try to match KSHIFTs.
15882 // TODO: Support narrower than legal shifts by widening and extracting.
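// KSHIFT of a v16i1 mask only needs AVX512F; the v8i1 form (kshiftb) requires DQI,
// and the v32i1/v64i1 forms need BWI, which is implied by those types being legal.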
15883 if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) {
15884 unsigned Offset = 0;
15885 for (SDValue V : { V1, V2 }) {
15887 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
15889 return DAG.getNode(Opcode, DL, VT, V,
15890 DAG.getConstant(ShiftAmt, DL, MVT::i8));
15891 Offset += NumElts; // Increment for next iteration.
15897 switch (VT.SimpleTy) {
15899 llvm_unreachable("Expected a vector of i1 elements");
15901 ExtVT = MVT::v2i64;
15904 ExtVT = MVT::v4i32;
15907 // Take a 512-bit type since there are more shuffle options on KNL. If we have VLX, use a 256-bit wide type instead.
15909 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
15912 // Take 512-bit type, unless we are avoiding 512-bit types and have the
15913 // 256-bit operation available.
15914 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
15917 // Take 512-bit type, unless we are avoiding 512-bit types and have the
15918 // 256-bit operation available.
15919 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
15920 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
15923 ExtVT = MVT::v64i8;
15927 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
15928 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
15930 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
15931 // The i1 elements were sign-extended, so a signed compare (SETGT) against zero converts the result back to a mask.
15932 int NumElems = VT.getVectorNumElements();
15933 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
15934 (Subtarget.hasDQI() && (NumElems < 32)))
15935 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
15936 Shuffle, ISD::SETGT);
15938 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
15941 /// Helper function that returns true if the shuffle mask should be
15942 /// commuted to improve canonicalization.
15943 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
15944 int NumElements = Mask.size();
15946 int NumV1Elements = 0, NumV2Elements = 0;
15950 else if (M < NumElements)
15955 // Commute the shuffle as needed such that more elements come from V1 than
15956 // V2. This allows us to match the shuffle pattern strictly on how many
15957 // elements come from V1 without handling the symmetric cases.
15958 if (NumV2Elements > NumV1Elements)
15961 assert(NumV1Elements > 0 && "No V1 indices");
15963 if (NumV2Elements == 0)
15966 // When the number of V1 and V2 elements are the same, try to minimize the
15967 // number of uses of V2 in the low half of the vector. When that is tied,
15968 // ensure that the sum of indices for V1 is equal to or lower than the sum
15969 // of indices for V2. When those are equal, try to ensure that the number of odd
15970 // indices for V1 is lower than the number of odd indices for V2.
15971 if (NumV1Elements == NumV2Elements) {
15972 int LowV1Elements = 0, LowV2Elements = 0;
15973 for (int M : Mask.slice(0, NumElements / 2))
15974 if (M >= NumElements)
15978 if (LowV2Elements > LowV1Elements)
15980 if (LowV2Elements == LowV1Elements) {
15981 int SumV1Indices = 0, SumV2Indices = 0;
15982 for (int i = 0, Size = Mask.size(); i < Size; ++i)
15983 if (Mask[i] >= NumElements)
15985 else if (Mask[i] >= 0)
15987 if (SumV2Indices < SumV1Indices)
15989 if (SumV2Indices == SumV1Indices) {
15990 int NumV1OddIndices = 0, NumV2OddIndices = 0;
15991 for (int i = 0, Size = Mask.size(); i < Size; ++i)
15992 if (Mask[i] >= NumElements)
15993 NumV2OddIndices += i % 2;
15994 else if (Mask[i] >= 0)
15995 NumV1OddIndices += i % 2;
15996 if (NumV2OddIndices < NumV1OddIndices)
16005 /// Top-level lowering for x86 vector shuffles.
16007 /// This handles decomposition, canonicalization, and lowering of all x86
16008 /// vector shuffles. Most of the specific lowering strategies are encapsulated
16009 /// above in helper routines. The canonicalization attempts to widen shuffles
16010 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
16011 /// s.t. only one of the two inputs needs to be tested, etc.
16012 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
16013 SelectionDAG &DAG) {
16014 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
16015 ArrayRef<int> Mask = SVOp->getMask();
16016 SDValue V1 = Op.getOperand(0);
16017 SDValue V2 = Op.getOperand(1);
16018 MVT VT = Op.getSimpleValueType();
16019 int NumElements = VT.getVectorNumElements();
16021 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
16023 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
16024 "Can't lower MMX shuffles");
16026 bool V1IsUndef = V1.isUndef();
16027 bool V2IsUndef = V2.isUndef();
16028 if (V1IsUndef && V2IsUndef)
16029 return DAG.getUNDEF(VT);
16031 // When we create a shuffle node we put the UNDEF node as the second operand,
16032 // but in some cases the first operand may be transformed to UNDEF.
16033 // In this case we should just commute the node.
16035 return DAG.getCommutedVectorShuffle(*SVOp);
16037 // Check for non-undef masks pointing at an undef vector and make the masks
16038 // undef as well. This makes it easier to match the shuffle based solely on the mask.
16042 if (M >= NumElements) {
16043 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
16044 for (int &M : NewMask)
16045 if (M >= NumElements)
16047 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16050 // Check for illegal shuffle mask element index values.
16051 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
16052 assert(llvm::all_of(Mask,
16053 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
16054 "Out of bounds shuffle index");
16056 // We actually see shuffles that are entirely re-arrangements of a set of
16057 // zero inputs. This mostly happens while decomposing complex shuffles into
16058 // simple ones. Directly lower these as a buildvector of zeros.
16059 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
16060 if (Zeroable.isAllOnesValue())
16061 return getZeroVector(VT, Subtarget, DAG, DL);
16063 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
16065 // Create an alternative mask with info about zeroable elements.
16066 // Here we do not set undef elements as zeroable.
16067 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
16069 assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
16070 for (int i = 0; i != NumElements; ++i)
16071 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
16072 ZeroableMask[i] = SM_SentinelZero;
16075 // Try to collapse shuffles into using a vector type with fewer elements but
16076 // wider element types. We cap this to not form integers or floating point
16077 // elements wider than 64 bits, but it might be interesting to form i128
16078 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
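// For example:
//   v4i32 mask <0,1,4,5>  ->  v2i64 mask <0,2>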
16079 SmallVector<int, 16> WidenedMask;
16080 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
16081 canWidenShuffleElements(ZeroableMask, WidenedMask)) {
16082 // Shuffle mask widening should not interfere with a broadcast opportunity
16083 // by obfuscating the operands with bitcasts.
16084 // TODO: Avoid lowering directly from this top-level function: make this
16085 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
16086 if (SDValue Broadcast =
16087 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
16090 MVT NewEltVT = VT.isFloatingPoint()
16091 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
16092 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
16093 int NewNumElts = NumElements / 2;
16094 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
16095 // Make sure that the new vector type is legal. For example, v2f64 isn't legal on SSE1.
16097 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
16099 // Modify the new Mask to take all zeros from the all-zero vector.
16100 // Choose indices that are blend-friendly.
16101 bool UsedZeroVector = false;
16102 assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
16103 "V2's non-undef elements are used?!");
16104 for (int i = 0; i != NewNumElts; ++i)
16105 if (WidenedMask[i] == SM_SentinelZero) {
16106 WidenedMask[i] = i + NewNumElts;
16107 UsedZeroVector = true;
16109 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
16110 // some elements to be undef.
16111 if (UsedZeroVector)
16112 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
16114 V1 = DAG.getBitcast(NewVT, V1);
16115 V2 = DAG.getBitcast(NewVT, V2);
16116 return DAG.getBitcast(
16117 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
16121 // Commute the shuffle if it will improve canonicalization.
16122 if (canonicalizeShuffleMaskWithCommute(Mask))
16123 return DAG.getCommutedVectorShuffle(*SVOp);
16126 lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
16129 // For each vector width, delegate to a specialized lowering routine.
16130 if (VT.is128BitVector())
16131 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
16134 if (VT.is256BitVector())
16135 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
16138 if (VT.is512BitVector())
16139 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
16143 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
16146 llvm_unreachable("Unimplemented!");
16149 /// Try to lower a VSELECT instruction to a vector shuffle.
16150 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
16151 const X86Subtarget &Subtarget,
16152 SelectionDAG &DAG) {
16153 SDValue Cond = Op.getOperand(0);
16154 SDValue LHS = Op.getOperand(1);
16155 SDValue RHS = Op.getOperand(2);
16156 MVT VT = Op.getSimpleValueType();
16158 // Only non-legal VSELECTs reach this lowering; convert those into generic
16159 // shuffles and re-use the shuffle lowering path for blends.
16160 SmallVector<int, 32> Mask;
16161 if (createShuffleMaskFromVSELECT(Mask, Cond))
16162 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
16167 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
16168 SDValue Cond = Op.getOperand(0);
16169 SDValue LHS = Op.getOperand(1);
16170 SDValue RHS = Op.getOperand(2);
16172 // A vselect where all conditions and data are constants can be optimized into
16173 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
16174 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
16175 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
16176 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
16179 // Try to lower this to a blend-style vector shuffle. This can handle all
16180 // constant condition cases.
16181 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
16184 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
16185 // with patterns on the mask registers on AVX-512.
16186 MVT CondVT = Cond.getSimpleValueType();
16187 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
16188 if (CondEltSize == 1)
16191 // Variable blends are only legal from SSE4.1 onward.
16192 if (!Subtarget.hasSSE41())
16196 MVT VT = Op.getSimpleValueType();
16197 unsigned EltSize = VT.getScalarSizeInBits();
16198 unsigned NumElts = VT.getVectorNumElements();
16200 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
16201 // into an i1 condition so that we can use the mask-based 512-bit blend instructions.
16203 if (VT.getSizeInBits() == 512) {
16204 // Build a mask by testing the condition against zero.
16205 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
16206 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
16207 DAG.getConstant(0, dl, CondVT),
16209 // Now return a new VSELECT using the mask.
16210 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
16213 // SEXT/TRUNC cases where the mask doesn't match the destination size.
16214 if (CondEltSize != EltSize) {
16215 // If we don't have a sign splat, rely on the expansion.
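// (Resizing the condition with sext/trunc only preserves each element's meaning
// when every bit of the element is a copy of its sign bit.)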
16216 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
16219 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
16220 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
16221 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
16222 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
16225 // Only some types will be legal on some subtargets. If we can emit a legal
16226 // VSELECT-matching blend, return Op; if we need to expand, return a null value.
16228 switch (VT.SimpleTy) {
16230 // Most of the vector types have blends past SSE4.1.
16234 // The byte blends for AVX vectors were introduced only in AVX2.
16235 if (Subtarget.hasAVX2())
16241 case MVT::v16i16: {
16242 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
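// There is no variable blend at i16 granularity; assuming the condition elements
// are all-ones or all-zeros (the usual x86 vector-boolean contents), a
// byte-granularity blend of the bitcast operands gives the same result.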
16243 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
16244 Cond = DAG.getBitcast(CastVT, Cond);
16245 LHS = DAG.getBitcast(CastVT, LHS);
16246 RHS = DAG.getBitcast(CastVT, RHS);
16247 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
16248 return DAG.getBitcast(VT, Select);
16253 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
16254 MVT VT = Op.getSimpleValueType();
16257 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
16260 if (VT.getSizeInBits() == 8) {
16261 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
16262 Op.getOperand(0), Op.getOperand(1));
16263 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
16266 if (VT == MVT::f32) {
16267 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
16268 // the result back to FR32 register. It's only worth matching if the
16269 // result has a single use which is a store or a bitcast to i32. And in
16270 // the case of a store, it's not worth it if the index is a constant 0,
16271 // because a MOVSSmr can be used instead, which is smaller and faster.
16272 if (!Op.hasOneUse())
16274 SDNode *User = *Op.getNode()->use_begin();
16275 if ((User->getOpcode() != ISD::STORE ||
16276 isNullConstant(Op.getOperand(1))) &&
16277 (User->getOpcode() != ISD::BITCAST ||
16278 User->getValueType(0) != MVT::i32))
16280 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
16281 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
16283 return DAG.getBitcast(MVT::f32, Extract);
16286 if (VT == MVT::i32 || VT == MVT::i64) {
16287 // ExtractPS/pextrq works with constant index.
16288 if (isa<ConstantSDNode>(Op.getOperand(1)))
16295 /// Extract one bit from mask vector, like v16i1 or v8i1.
16296 /// AVX-512 feature.
16297 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
16298 const X86Subtarget &Subtarget) {
16299 SDValue Vec = Op.getOperand(0);
16301 MVT VecVT = Vec.getSimpleValueType();
16302 SDValue Idx = Op.getOperand(1);
16303 MVT EltVT = Op.getSimpleValueType();
16305 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
16306 "Unexpected vector type in ExtractBitFromMaskVector");
16308 // A variable index can't be handled in mask registers,
16309 // so extend the vector to VR512/VR128.
16310 if (!isa<ConstantSDNode>(Idx)) {
16311 unsigned NumElts = VecVT.getVectorNumElements();
16312 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
16313 // than extending to 128/256-bit.
16314 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
16315 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
16316 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
16317 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
16318 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
16321 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
16322 if (IdxVal == 0) // the operation is legal
16325 // Extend to natively supported kshift.
16326 unsigned NumElems = VecVT.getVectorNumElements();
16327 MVT WideVecVT = VecVT;
16328 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
16329 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
16330 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
16331 DAG.getUNDEF(WideVecVT), Vec,
16332 DAG.getIntPtrConstant(0, dl));
16335 // Use kshiftr instruction to move to the lower element.
16336 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
16337 DAG.getConstant(IdxVal, dl, MVT::i8));
16339 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
16340 DAG.getIntPtrConstant(0, dl));
16344 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16345 SelectionDAG &DAG) const {
16347 SDValue Vec = Op.getOperand(0);
16348 MVT VecVT = Vec.getSimpleValueType();
16349 SDValue Idx = Op.getOperand(1);
16351 if (VecVT.getVectorElementType() == MVT::i1)
16352 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
16354 if (!isa<ConstantSDNode>(Idx)) {
16355 // It's more profitable to go through memory (1 cycle throughput)
16356 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
16357 // IACA tool was used to get performance estimation
16358 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
16360 // example : extractelement <16 x i8> %a, i32 %i
16362 // Block Throughput: 3.00 Cycles
16363 // Throughput Bottleneck: Port5
16365 // | Num Of | Ports pressure in cycles | |
16366 // | Uops | 0 - DV | 5 | 6 | 7 | |
16367 // ---------------------------------------------
16368 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
16369 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
16370 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
16371 // Total Num Of Uops: 4
16374 // Block Throughput: 1.00 Cycles
16375 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
16377 // | | Ports pressure in cycles | |
16378 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
16379 // ---------------------------------------------------------
16380 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
16381 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
16382 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
16383 // Total Num Of Uops: 4
16388 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
16390 // If this is a 256-bit or 512-bit vector, first extract the 128-bit subvector
16391 // containing the element and then extract the element from that 128-bit vector.
16392 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
16393 // Get the 128-bit vector.
16394 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
16395 MVT EltVT = VecVT.getVectorElementType();
16397 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
16398 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
16400 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
16401 // this can be done with a mask.
16402 IdxVal &= ElemsPerChunk - 1;
16403 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
16404 DAG.getConstant(IdxVal, dl, MVT::i32));
16407 assert(VecVT.is128BitVector() && "Unexpected vector length");
16409 MVT VT = Op.getSimpleValueType();
16411 if (VT.getSizeInBits() == 16) {
16412 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
16413 // we're going to zero extend the register or fold the store (SSE41 only).
16414 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
16415 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
16416 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
16417 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
16418 DAG.getBitcast(MVT::v4i32, Vec), Idx));
16420 // Transform it so it matches pextrw, which produces a 32-bit result.
16421 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
16422 Op.getOperand(0), Op.getOperand(1));
16423 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
16426 if (Subtarget.hasSSE41())
16427 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
16430 // TODO: We only extract a single element from v16i8, we can probably afford
16431 // to be more aggressive here before using the default approach of spilling to the stack.
16433 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
16434 // Extract either the lowest i32 or any i16, and extract the sub-byte.
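// e.g. byte 5 is extracted by reading word 2 (bytes 4..5) and shifting right by 8.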
16435 int DWordIdx = IdxVal / 4;
16436 if (DWordIdx == 0) {
16437 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
16438 DAG.getBitcast(MVT::v4i32, Vec),
16439 DAG.getIntPtrConstant(DWordIdx, dl));
16440 int ShiftVal = (IdxVal % 4) * 8;
16442 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
16443 DAG.getConstant(ShiftVal, dl, MVT::i8));
16444 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
16447 int WordIdx = IdxVal / 2;
16448 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
16449 DAG.getBitcast(MVT::v8i16, Vec),
16450 DAG.getIntPtrConstant(WordIdx, dl));
16451 int ShiftVal = (IdxVal % 2) * 8;
16453 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
16454 DAG.getConstant(ShiftVal, dl, MVT::i8));
16455 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
16458 if (VT.getSizeInBits() == 32) {
16462 // SHUFPS the element to the lowest double word, then movss.
16463 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
16464 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
16465 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
16466 DAG.getIntPtrConstant(0, dl));
16469 if (VT.getSizeInBits() == 64) {
16470 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
16471 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
16472 // to match extract_elt for f64.
16476 // UNPCKHPD the element to the lowest double word, then movsd.
16477 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
16478 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
16479 int Mask[2] = { 1, -1 };
16480 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
16481 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
16482 DAG.getIntPtrConstant(0, dl));
16488 /// Insert one bit to mask vector, like v16i1 or v8i1.
16489 /// AVX-512 feature.
16490 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
16491 const X86Subtarget &Subtarget) {
16493 SDValue Vec = Op.getOperand(0);
16494 SDValue Elt = Op.getOperand(1);
16495 SDValue Idx = Op.getOperand(2);
16496 MVT VecVT = Vec.getSimpleValueType();
16498 if (!isa<ConstantSDNode>(Idx)) {
16499 // Non constant index. Extend source and destination,
16500 // insert element and then truncate the result.
16501 unsigned NumElts = VecVT.getVectorNumElements();
16502 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
16503 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
16504 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
16505 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
16506 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
16507 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
16510 // Copy into a k-register, extract to v1i1 and insert_subvector.
16511 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
16513 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
16517 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
16518 SelectionDAG &DAG) const {
16519 MVT VT = Op.getSimpleValueType();
16520 MVT EltVT = VT.getVectorElementType();
16521 unsigned NumElts = VT.getVectorNumElements();
16523 if (EltVT == MVT::i1)
16524 return InsertBitToMaskVector(Op, DAG, Subtarget);
16527 SDValue N0 = Op.getOperand(0);
16528 SDValue N1 = Op.getOperand(1);
16529 SDValue N2 = Op.getOperand(2);
16530 if (!isa<ConstantSDNode>(N2))
16532 auto *N2C = cast<ConstantSDNode>(N2);
16533 unsigned IdxVal = N2C->getZExtValue();
16535 bool IsZeroElt = X86::isZeroNode(N1);
16536 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
16538 // If we are inserting an element, see if we can do this more efficiently with
16539 // a blend shuffle against a rematerializable vector rather than a costly integer insertion.
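// For example, zeroing element 2 of a v4i32 uses the blend mask <0,1,6,3> against
// a zero vector.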
16541 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
16542 16 <= EltVT.getSizeInBits()) {
16543 SmallVector<int, 8> BlendMask;
16544 for (unsigned i = 0; i != NumElts; ++i)
16545 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
16546 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
16547 : getOnesVector(VT, DAG, dl);
16548 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
16551 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
16552 // into that, and then insert the subvector back into the result.
16553 if (VT.is256BitVector() || VT.is512BitVector()) {
16554 // With a 256-bit vector, we can insert into the zero element efficiently
16555 // using a blend if we have AVX or AVX2 and the right data type.
16556 if (VT.is256BitVector() && IdxVal == 0) {
16557 // TODO: It is worthwhile to cast integer to floating point and back
16558 // and incur a domain crossing penalty if that's what we'll end up
16559 // doing anyway after extracting to a 128-bit vector.
16560 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
16561 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
16562 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
16563 N2 = DAG.getIntPtrConstant(1, dl);
16564 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
16568 // Get the desired 128-bit vector chunk.
16569 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
16571 // Insert the element into the desired chunk.
16572 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
16573 assert(isPowerOf2_32(NumEltsIn128));
16574 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
16575 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
16577 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
16578 DAG.getConstant(IdxIn128, dl, MVT::i32));
16580 // Insert the changed part back into the bigger vector
16581 return insert128BitVector(N0, V, IdxVal, DAG, dl);
16583 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
16585 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
16586 // argument. SSE41 required for pinsrb.
16587 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
16589 if (VT == MVT::v8i16) {
16590 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
16591 Opc = X86ISD::PINSRW;
16593 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
16594 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
16595 Opc = X86ISD::PINSRB;
16598 if (N1.getValueType() != MVT::i32)
16599 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
16600 if (N2.getValueType() != MVT::i32)
16601 N2 = DAG.getIntPtrConstant(IdxVal, dl);
16602 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
16605 if (Subtarget.hasSSE41()) {
16606 if (EltVT == MVT::f32) {
16607 // Bits [7:6] of the constant are the source select. This will always be
16608 // zero here. The DAG Combiner may combine an extract_elt index into
16609 // these bits. For example (insert (extract, 3), 2) could be matched by
16610 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
16611 // Bits [5:4] of the constant are the destination select. This is the
16612 // value of the incoming immediate.
16613 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
16614 // combine either bitwise AND or insert of float 0.0 to set these bits.
16616 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
16617 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
16618 // If this is an insertion of 32-bits into the low 32-bits of
16619 // a vector, we prefer to generate a blend with immediate rather
16620 // than an insertps. Blends are simpler operations in hardware and so
16621 // will always have equal or better performance than insertps.
16622 // But if optimizing for size and there's a load folding opportunity,
16623 // generate insertps because blendps does not have a 32-bit memory
16625 N2 = DAG.getIntPtrConstant(1, dl);
16626 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
16627 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
16629 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
16630 // Create this as a scalar-to-vector.
16631 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
16632 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
16635 // PINSR* works with constant index.
16636 if (EltVT == MVT::i32 || EltVT == MVT::i64)
16643 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
16644 SelectionDAG &DAG) {
16646 MVT OpVT = Op.getSimpleValueType();
16648 // It's always cheaper to replace an xor+movd with xorps, and it simplifies further DAG combines.
16650 if (X86::isZeroNode(Op.getOperand(0)))
16651 return getZeroVector(OpVT, Subtarget, DAG, dl);
16653 // If this is a wider (256-bit or 512-bit) vector result, first insert into a
16654 // 128-bit vector and then insert that into the full-width vector.
16655 if (!OpVT.is128BitVector()) {
16656 // Insert into a 128-bit vector.
16657 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
16658 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
16659 OpVT.getVectorNumElements() / SizeFactor);
16661 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
16663 // Insert the 128-bit vector.
16664 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
16666 assert(OpVT.is128BitVector() && "Expected an SSE type!");
16668 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
16669 if (OpVT == MVT::v4i32)
16672 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
16673 return DAG.getBitcast(
16674 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
16677 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
16678 // simple superregister reference or explicit instructions to insert
16679 // the upper bits of a vector.
16680 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
16681 SelectionDAG &DAG) {
16682 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
16684 return insert1BitVector(Op, DAG, Subtarget);
16687 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
16688 SelectionDAG &DAG) {
16689 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16690 "Only vXi1 extract_subvectors need custom lowering");
16693 SDValue Vec = Op.getOperand(0);
16694 SDValue Idx = Op.getOperand(1);
16696 if (!isa<ConstantSDNode>(Idx))
16699 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
16700 if (IdxVal == 0) // the operation is legal
16703 MVT VecVT = Vec.getSimpleValueType();
16704 unsigned NumElems = VecVT.getVectorNumElements();
16706 // Extend to natively supported kshift.
16707 MVT WideVecVT = VecVT;
16708 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
16709 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
16710 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
16711 DAG.getUNDEF(WideVecVT), Vec,
16712 DAG.getIntPtrConstant(0, dl));
16715 // Shift to the LSB.
16716 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
16717 DAG.getConstant(IdxVal, dl, MVT::i8));
16719 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
16720 DAG.getIntPtrConstant(0, dl));
16723 // Returns the appropriate wrapper opcode for a global reference.
16724 unsigned X86TargetLowering::getGlobalWrapperKind(
16725 const GlobalValue *GV, const unsigned char OpFlags) const {
16726 // References to absolute symbols are never PC-relative.
16727 if (GV && GV->isAbsoluteSymbolRef())
16728 return X86ISD::Wrapper;
16730 CodeModel::Model M = getTargetMachine().getCodeModel();
16731 if (Subtarget.isPICStyleRIPRel() &&
16732 (M == CodeModel::Small || M == CodeModel::Kernel))
16733 return X86ISD::WrapperRIP;
16735 // GOTPCREL references must always use RIP.
16736 if (OpFlags == X86II::MO_GOTPCREL)
16737 return X86ISD::WrapperRIP;
16739 return X86ISD::Wrapper;
16742 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
16743 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
16744 // one of the above mentioned nodes. It has to be wrapped because otherwise
16745 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
16746 // be used to form an addressing mode. These wrapped nodes will be selected into MOV32ri.
16749 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
16750 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
16752 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
16753 // global base reg.
16754 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
16756 auto PtrVT = getPointerTy(DAG.getDataLayout());
16757 SDValue Result = DAG.getTargetConstantPool(
16758 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
16760 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
16761 // With PIC, the address is actually $g + Offset.
16764 DAG.getNode(ISD::ADD, DL, PtrVT,
16765 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
16771 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
16772 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
16774 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
16775 // global base reg.
16776 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
16778 auto PtrVT = getPointerTy(DAG.getDataLayout());
16779 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
16781 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
16783 // With PIC, the address is actually $g + Offset.
16786 DAG.getNode(ISD::ADD, DL, PtrVT,
16787 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
16793 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
16794 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
16796 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
16797 // global base reg.
16798 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
16799 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
16801 auto PtrVT = getPointerTy(DAG.getDataLayout());
16802 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
16805 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
16807 // With PIC, the address is actually $g + Offset.
16810 DAG.getNode(ISD::ADD, DL, PtrVT,
16811 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
16814 // For symbols that require a load from a stub to get the address, emit the load.
16816 if (isGlobalStubReference(OpFlag))
16817 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
16818 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
16824 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
16825 // Create the TargetBlockAddress node.
16826 unsigned char OpFlags =
16827 Subtarget.classifyBlockAddressReference();
16828 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
16829 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
16831 auto PtrVT = getPointerTy(DAG.getDataLayout());
16832 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
16833 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
16835 // With PIC, the address is actually $g + Offset.
16836 if (isGlobalRelativeToPICBase(OpFlags)) {
16837 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
16838 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
16844 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
16845 const SDLoc &dl, int64_t Offset,
16846 SelectionDAG &DAG) const {
16847 // Create the TargetGlobalAddress node, folding in the constant
16848 // offset if it is legal.
16849 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
16850 CodeModel::Model M = DAG.getTarget().getCodeModel();
16851 auto PtrVT = getPointerTy(DAG.getDataLayout());
16853 if (OpFlags == X86II::MO_NO_FLAG &&
16854 X86::isOffsetSuitableForCodeModel(Offset, M)) {
16855 // A direct static reference to a global.
16856 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
16859 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
16862 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
16864 // With PIC, the address is actually $g + Offset.
16865 if (isGlobalRelativeToPICBase(OpFlags)) {
16866 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
16867 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
16870 // For globals that require a load from a stub to get the address, emit the load.
16872 if (isGlobalStubReference(OpFlags))
16873 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
16874 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
16876 // If there was a non-zero offset that we didn't fold, create an explicit
16877 // addition for it.
16879 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
16880 DAG.getConstant(Offset, dl, PtrVT));
16886 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
16887 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
16888 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
16889 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
16893 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
16894 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
16895 unsigned char OperandFlags, bool LocalDynamic = false) {
16896 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
16897 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16899 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
16900 GA->getValueType(0),
16904 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
16908 SDValue Ops[] = { Chain, TGA, *InFlag };
16909 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
16911 SDValue Ops[] = { Chain, TGA };
16912 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
16915 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
16916 MFI.setAdjustsStack(true);
16917 MFI.setHasCalls(true);
16919 SDValue Flag = Chain.getValue(1);
16920 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
16923 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
16925 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
16928 SDLoc dl(GA); // ? function entry point might be better
16929 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
16930 DAG.getNode(X86ISD::GlobalBaseReg,
16931 SDLoc(), PtrVT), InFlag);
16932 InFlag = Chain.getValue(1);
16934 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
16937 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
16939 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
16941 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
16942 X86::RAX, X86II::MO_TLSGD);
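// For reference (a rough sketch; the padding bytes mandated by the psABI are
// omitted), the 64-bit general-dynamic form assembles to something like:
//   leaq  x@tlsgd(%rip), %rdi
//   callq __tls_get_addr@PLT    // the address of x comes back in %rax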
16945 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
16951 // Get the start address of the TLS block for this module.
16952 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
16953 .getInfo<X86MachineFunctionInfo>();
16954 MFI->incNumLocalDynamicTLSAccesses();
16958 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
16959 X86II::MO_TLSLD, /*LocalDynamic=*/true);
16962 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
16963 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
16964 InFlag = Chain.getValue(1);
16965 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
16966 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
16969 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations of Base.
16973 unsigned char OperandFlags = X86II::MO_DTPOFF;
16974 unsigned WrapperKind = X86ISD::Wrapper;
16975 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
16976 GA->getValueType(0),
16977 GA->getOffset(), OperandFlags);
16978 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
16980 // Add x@dtpoff with the base.
16981 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
16984 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
16985 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
16986 const EVT PtrVT, TLSModel::Model model,
16987 bool is64Bit, bool isPIC) {
16990 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
16991 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
16992 is64Bit ? 257 : 256));
16994 SDValue ThreadPointer =
16995 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
16996 MachinePointerInfo(Ptr));
16998 unsigned char OperandFlags = 0;
16999 // Most TLS accesses are not RIP relative, even on x86-64. One exception is initial exec.
17001 unsigned WrapperKind = X86ISD::Wrapper;
17002 if (model == TLSModel::LocalExec) {
17003 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
17004 } else if (model == TLSModel::InitialExec) {
17006 OperandFlags = X86II::MO_GOTTPOFF;
17007 WrapperKind = X86ISD::WrapperRIP;
17009 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
17012 llvm_unreachable("Unexpected model");
17015 // emit "addl x@ntpoff,%eax" (local exec)
17016 // or "addl x@indntpoff,%eax" (initial exec)
17017 // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
17019 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
17020 GA->getOffset(), OperandFlags);
17021 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
17023 if (model == TLSModel::InitialExec) {
17024 if (isPIC && !is64Bit) {
17025 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
17026 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
17030 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
17031 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
17034 // The address of the thread-local variable is the sum of the thread
17035 // pointer and the offset of the variable.
17036 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
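// As a rough illustration, the 64-bit local-exec form ends up as something
// like:
//   movq %fs:0, %rax
//   leaq x@tpoff(%rax), %rax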
17040 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
17042 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
17044 if (DAG.getTarget().useEmulatedTLS())
17045 return LowerToTLSEmulatedModel(GA, DAG);
17047 const GlobalValue *GV = GA->getGlobal();
17048 auto PtrVT = getPointerTy(DAG.getDataLayout());
17049 bool PositionIndependent = isPositionIndependent();
17051 if (Subtarget.isTargetELF()) {
17052 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
17054 case TLSModel::GeneralDynamic:
17055 if (Subtarget.is64Bit())
17056 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
17057 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
17058 case TLSModel::LocalDynamic:
17059 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
17060 Subtarget.is64Bit());
17061 case TLSModel::InitialExec:
17062 case TLSModel::LocalExec:
17063 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
17064 PositionIndependent);
17066 llvm_unreachable("Unknown TLS model.");
17069 if (Subtarget.isTargetDarwin()) {
17070 // Darwin only has one model of TLS. Lower to that.
17071 unsigned char OpFlag = 0;
17072 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
17073 X86ISD::WrapperRIP : X86ISD::Wrapper;
17075 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
17076 // global base reg.
17077 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
17079 OpFlag = X86II::MO_TLVP_PIC_BASE;
17081 OpFlag = X86II::MO_TLVP;
17083 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
17084 GA->getValueType(0),
17085 GA->getOffset(), OpFlag);
17086 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
17088 // With PIC32, the address is actually $g + Offset.
17090 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
17091 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
17094 // Lowering the machine isd will make sure everything is in the right
17096 SDValue Chain = DAG.getEntryNode();
17097 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
17098 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
17099 SDValue Args[] = { Chain, Offset };
17100 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
17101 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
17102 DAG.getIntPtrConstant(0, DL, true),
17103 Chain.getValue(1), DL);
17105 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
17106 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
17107 MFI.setAdjustsStack(true);
17109 // And our return value (tls address) is in the standard call return value register.
17111 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
17112 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
17115 if (Subtarget.isTargetKnownWindowsMSVC() ||
17116 Subtarget.isTargetWindowsItanium() ||
17117 Subtarget.isTargetWindowsGNU()) {
17118 // Just use the implicit TLS architecture
17119 // Need to generate something similar to:
17120 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
17122 // mov ecx, dword [rel _tls_index]; Load index (from C runtime)
17123 // mov rcx, qword [rdx+rcx*8]
17124 // mov eax, .tls$:tlsvar
17125 // [rax+rcx] contains the address
17126 // Windows 64bit: gs:0x58
17127 // Windows 32bit: fs:__tls_array
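// The 32-bit flavour is analogous (a rough sketch):
//   mov eax, dword [fs:__tls_array]  ; literal 0x2C on MinGW, where the symbol is absent
//   mov ecx, dword [rel _tls_index]
//   mov eax, dword [eax+ecx*4]
//   ; [eax + .tls$:tlsvar] contains the address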
17130 SDValue Chain = DAG.getEntryNode();
17132 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
17133 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
17134 // use its literal value of 0x2C.
17135 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
17136 ? Type::getInt8PtrTy(*DAG.getContext(),
17138 : Type::getInt32PtrTy(*DAG.getContext(),
17141 SDValue TlsArray = Subtarget.is64Bit()
17142 ? DAG.getIntPtrConstant(0x58, dl)
17143 : (Subtarget.isTargetWindowsGNU()
17144 ? DAG.getIntPtrConstant(0x2C, dl)
17145 : DAG.getExternalSymbol("_tls_array", PtrVT));
17147 SDValue ThreadPointer =
17148 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
17151 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
17152 res = ThreadPointer;
17154 // Load the _tls_index variable
17155 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
17156 if (Subtarget.is64Bit())
17157 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
17158 MachinePointerInfo(), MVT::i32);
17160 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
17162 auto &DL = DAG.getDataLayout();
17164 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
17165 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
17167 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
17170 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
17172 // Get the offset of start of .tls section
17173 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
17174 GA->getValueType(0),
17175 GA->getOffset(), X86II::MO_SECREL);
17176 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
17178 // The address of the thread-local variable is the sum of the thread
17179 // pointer and the offset of the variable.
17180 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
17183 llvm_unreachable("TLS not implemented for this target.");
17186 /// Lower SRA_PARTS and friends, which return two i32 values
17187 /// and take a 2 x i32 value to shift plus a shift amount.
17188 /// TODO: Can this be moved to general expansion code?
17189 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
17190 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
17191 MVT VT = Op.getSimpleValueType();
17192 unsigned VTBits = VT.getSizeInBits();
17194 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
17195 SDValue ShOpLo = Op.getOperand(0);
17196 SDValue ShOpHi = Op.getOperand(1);
17197 SDValue ShAmt = Op.getOperand(2);
17198 // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
17199 // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
17201 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
17202 DAG.getConstant(VTBits - 1, dl, MVT::i8));
17203 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
17204 DAG.getConstant(VTBits - 1, dl, MVT::i8))
17205 : DAG.getConstant(0, dl, VT);
17207 SDValue Tmp2, Tmp3;
17208 if (Op.getOpcode() == ISD::SHL_PARTS) {
17209 Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
17210 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
17212 Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
17213 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
17216 // If the shift amount is greater than or equal to the width of a part, we can't
17217 // rely on the results of shld/shrd. Insert a test and select the appropriate
17218 // values for large shift amounts.
17219 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
17220 DAG.getConstant(VTBits, dl, MVT::i8));
17221 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
17222 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
17225 if (Op.getOpcode() == ISD::SHL_PARTS) {
17226 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
17227 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
17229 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
17230 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
17233 return DAG.getMergeValues({ Lo, Hi }, dl);
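// Worked example (sketch): for an i64 SRL_PARTS on a 32-bit target with
// ShAmt == 40, Tmp3 = ShOpHi >> (40 & 31) = ShOpHi >> 8, and the (ShAmt & 32)
// test fires, so we select Lo = ShOpHi >> 8 and Hi = 0, which is exactly what
// a 64-bit logical shift right by 40 produces.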
17236 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
17237 SelectionDAG &DAG) {
17238 MVT VT = Op.getSimpleValueType();
17239 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
17240 "Unexpected funnel shift opcode!");
17243 SDValue Op0 = Op.getOperand(0);
17244 SDValue Op1 = Op.getOperand(1);
17245 SDValue Amt = Op.getOperand(2);
17247 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
17249 if (VT.isVector()) {
17250 assert(Subtarget.hasVBMI2() && "Expected VBMI2");
17253 std::swap(Op0, Op1);
17255 APInt APIntShiftAmt;
17256 if (isConstantSplat(Amt, APIntShiftAmt)) {
17257 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
17258 return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
17259 Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
17262 return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
17266 assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
17267 "Unexpected funnel shift type!");
17269 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
17270 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
17271 if (!OptForSize && Subtarget.isSHLDSlow())
17275 std::swap(Op0, Op1);
17277 // i16 needs the shift amount masked explicitly, but i32/i64 have an implicit modulo.
17278 if (VT == MVT::i16)
17279 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
17280 DAG.getConstant(15, DL, Amt.getValueType()));
17282 unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
17283 return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
17286 // Try to use a packed vector operation to handle i64 on 32-bit targets when
17287 // AVX512DQ is enabled.
17288 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
17289 const X86Subtarget &Subtarget) {
17290 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
17291 Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
17292 SDValue Src = Op.getOperand(0);
17293 MVT SrcVT = Src.getSimpleValueType();
17294 MVT VT = Op.getSimpleValueType();
17296 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
17297 (VT != MVT::f32 && VT != MVT::f64))
17300 // Pack the i64 into a vector, do the operation and extract.
17302 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
17303 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
17304 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
17305 MVT VecVT = MVT::getVectorVT(VT, NumElts);
17308 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
17309 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
17310 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
17311 DAG.getIntPtrConstant(0, dl));
17314 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
17315 SelectionDAG &DAG) const {
17316 SDValue Src = Op.getOperand(0);
17317 MVT SrcVT = Src.getSimpleValueType();
17318 MVT VT = Op.getSimpleValueType();
17321 if (SrcVT.isVector()) {
17322 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
17323 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
17324 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
17325 DAG.getUNDEF(SrcVT)));
17330 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
17331 "Unknown SINT_TO_FP to lower!");
17333 // These are really Legal; return the operand so the caller accepts it as Legal.
17335 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
17337 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit())
17340 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
17343 SDValue ValueToStore = Op.getOperand(0);
17344 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
17345 !Subtarget.is64Bit())
17346 // Bitcasting to f64 here allows us to do a single 64-bit store from
17347 // an SSE register, avoiding the store forwarding penalty that would come
17348 // with two 32-bit stores.
17349 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
17351 unsigned Size = SrcVT.getSizeInBits()/8;
17352 MachineFunction &MF = DAG.getMachineFunction();
17353 auto PtrVT = getPointerTy(MF.getDataLayout());
17354 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
17355 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
17356 SDValue Chain = DAG.getStore(
17357 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
17358 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
17359 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
17362 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
17364 SelectionDAG &DAG) const {
17368 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
17370 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
17372 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
17374 unsigned ByteSize = SrcVT.getSizeInBits()/8;
17376 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
17377 MachineMemOperand *MMO;
17379 int SSFI = FI->getIndex();
17380 MMO = DAG.getMachineFunction().getMachineMemOperand(
17381 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
17382 MachineMemOperand::MOLoad, ByteSize, ByteSize);
17384 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
17385 StackSlot = StackSlot.getOperand(1);
17387 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
17388 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
17390 Tys, Ops, SrcVT, MMO);
17393 Chain = Result.getValue(1);
17394 SDValue InFlag = Result.getValue(2);
17396 // FIXME: Currently the FST is glued to the FILD_FLAG. This
17397 // shouldn't be necessary except that RFP cannot be live across
17398 // multiple blocks. When stackifier is fixed, they can be uncoupled.
17399 MachineFunction &MF = DAG.getMachineFunction();
17400 unsigned SSFISize = Op.getValueSizeInBits()/8;
17401 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
17402 auto PtrVT = getPointerTy(MF.getDataLayout());
17403 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
17404 Tys = DAG.getVTList(MVT::Other);
17406 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
17408 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
17409 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
17410 MachineMemOperand::MOStore, SSFISize, SSFISize);
17412 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
17413 Ops, Op.getValueType(), MMO);
17414 Result = DAG.getLoad(
17415 Op.getValueType(), DL, Chain, StackSlot,
17416 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
17422 /// 64-bit unsigned integer to double expansion.
17423 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
17424 const X86Subtarget &Subtarget) {
17425 // This algorithm is not obvious. Here is what we're trying to output:
17428 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
17429 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
17431 haddpd %xmm0, %xmm0
17433 pshufd $0x4e, %xmm0, %xmm1
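// Roughly why this works: after the punpckldq the two doubles hold the bit
// patterns 0x43300000'<lo32> and 0x45300000'<hi32>, i.e. the values
// 2^52 + lo32 and 2^84 + hi32 * 2^32 (the integer halves land in the low
// mantissa bits of 2^52 and 2^84). Subtracting c1 = { 2^52, 2^84 } leaves
// { lo32, hi32 * 2^32 } exactly, and the horizontal add then yields the
// original unsigned 64-bit value, rounded to double.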
17439 LLVMContext *Context = DAG.getContext();
17441 // Build some magic constants.
17442 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
17443 Constant *C0 = ConstantDataVector::get(*Context, CV0);
17444 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
17445 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
17447 SmallVector<Constant*,2> CV1;
17449 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
17450 APInt(64, 0x4330000000000000ULL))));
17452 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
17453 APInt(64, 0x4530000000000000ULL))));
17454 Constant *C1 = ConstantVector::get(CV1);
17455 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
17457 // Load the 64-bit value into an XMM register.
17458 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
17461 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
17462 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
17463 /* Alignment = */ 16);
17465 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
17468 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
17469 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
17470 /* Alignment = */ 16);
17471 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
17472 // TODO: Are there any fast-math-flags to propagate here?
17473 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
17476 if (Subtarget.hasSSE3()) {
17477 // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
17478 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
17480 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
17481 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
17484 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
17485 DAG.getIntPtrConstant(0, dl));
17488 /// 32-bit unsigned integer to float expansion.
17489 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
17490 const X86Subtarget &Subtarget) {
17492 // FP constant to bias correct the final result.
17493 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
17496 // Load the 32-bit value into an XMM register.
17497 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
17500 // Zero out the upper parts of the register.
17501 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
17503 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
17504 DAG.getBitcast(MVT::v2f64, Load),
17505 DAG.getIntPtrConstant(0, dl));
17507 // Or the load with the bias.
17508 SDValue Or = DAG.getNode(
17509 ISD::OR, dl, MVT::v2i64,
17510 DAG.getBitcast(MVT::v2i64,
17511 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
17512 DAG.getBitcast(MVT::v2i64,
17513 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
17515 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
17516 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
17518 // Subtract the bias.
17519 // TODO: Are there any fast-math-flags to propagate here?
17520 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
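// Why this is exact (sketch): 0x4330000000000000 is the bit pattern of the
// double 2^52, so OR'ing the zero-extended 32-bit value into the low mantissa
// bits yields the double 2^52 + v, and subtracting the bias leaves exactly v.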
17522 // Handle final rounding.
17523 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
17526 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
17527 const X86Subtarget &Subtarget,
17529 if (Op.getSimpleValueType() != MVT::v2f64)
17532 SDValue N0 = Op.getOperand(0);
17533 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
17535 // Legalize to v4i32 type.
17536 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
17537 DAG.getUNDEF(MVT::v2i32));
17539 if (Subtarget.hasAVX512())
17540 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
17542 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
17543 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
17544 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
17545 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
17547 // Two to the power of half-word-size.
17548 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
17550 // Clear upper part of LO, lower HI.
17551 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
17552 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
17554 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
17555 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
17556 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
17558 // Add the two halves.
17559 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
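// Rough example: for the lane 0x89ABCDEF, HI = 0x89AB and LO = 0xCDEF are both
// small enough to convert exactly as signed values, so the result is
// 0x89AB * 65536.0 + 0xCDEF = 2309737967.0, i.e. the original unsigned value.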
17562 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
17563 const X86Subtarget &Subtarget) {
17564 // The algorithm is the following:
17565 // #ifdef __SSE4_1__
17566 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
17567 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
17568 // (uint4) 0x53000000, 0xaa);
17570 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
17571 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
17573 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
17574 // return (float4) lo + fhi;
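// Roughly why the constants work: 0x4b000000 is the bit pattern of 0x1.0p23f
// and 0x53000000 of 0x1.0p39f, so lo reinterpreted as float is
// 2^23 + (v & 0xffff) and hi is 2^39 + (v >> 16) * 2^16; adding
// -(0x1.0p39f + 0x1.0p23f) to hi and then adding lo reconstructs v (modulo
// the final rounding).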
17576 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
17577 // reassociate the two FADDs, and if we do that, the algorithm fails
17578 // spectacularly (PR24512).
17579 // FIXME: If we ever have some kind of Machine FMF, this should be marked
17580 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
17581 // there's also the MachineCombiner reassociations happening on Machine IR.
17582 if (DAG.getTarget().Options.UnsafeFPMath)
17586 SDValue V = Op->getOperand(0);
17587 MVT VecIntVT = V.getSimpleValueType();
17588 bool Is128 = VecIntVT == MVT::v4i32;
17589 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
17590 // If we convert to something other than the supported type, e.g., to v4f64, bail out.
17592 if (VecFloatVT != Op->getSimpleValueType(0))
17595 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
17596 "Unsupported custom type");
17598 // In the #ifdef/#else code, we have in common:
17599 // - The vector of constants:
17605 // Create the splat vector for 0x4b000000.
17606 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
17607 // Create the splat vector for 0x53000000.
17608 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
17610 // Create the right shift.
17611 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
17612 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
17615 if (Subtarget.hasSSE41()) {
17616 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
17617 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
17618 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
17619 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
17620 // Low will be bitcasted right away, so do not bother bitcasting back to its original type.
17622 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
17623 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
17624 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
17625 // (uint4) 0x53000000, 0xaa);
17626 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
17627 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
17628 // High will be bitcasted right away, so do not bother bitcasting back to
17629 // its original type.
17630 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
17631 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
17633 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
17634 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
17635 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
17636 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
17638 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
17639 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
17642 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
17643 SDValue VecCstFAdd = DAG.getConstantFP(
17644 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
17646 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
17647 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
17648 // TODO: Are there any fast-math-flags to propagate here?
17650 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
17651 // return (float4) lo + fhi;
17652 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
17653 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
17656 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
17657 const X86Subtarget &Subtarget) {
17658 SDValue N0 = Op.getOperand(0);
17659 MVT SrcVT = N0.getSimpleValueType();
17662 switch (SrcVT.SimpleTy) {
17664 llvm_unreachable("Custom UINT_TO_FP is not supported!");
17666 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
17669 assert(!Subtarget.hasAVX512());
17670 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
17674 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
17675 SelectionDAG &DAG) const {
17676 SDValue N0 = Op.getOperand(0);
17678 auto PtrVT = getPointerTy(DAG.getDataLayout());
17680 if (Op.getSimpleValueType().isVector())
17681 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
17683 MVT SrcVT = N0.getSimpleValueType();
17684 MVT DstVT = Op.getSimpleValueType();
17686 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
17687 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
17688 // Conversions from unsigned i32 to f32/f64 are legal,
17689 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
17693 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
17696 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
17697 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
17698 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
17699 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
17700 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
17703 // Make a 64-bit buffer, and use it to build an FILD.
17704 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
17705 if (SrcVT == MVT::i32) {
17706 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
17707 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
17708 StackSlot, MachinePointerInfo());
17709 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
17710 OffsetSlot, MachinePointerInfo());
17711 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
17715 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
17716 SDValue ValueToStore = Op.getOperand(0);
17717 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
17718 // Bitcasting to f64 here allows us to do a single 64-bit store from
17719 // an SSE register, avoiding the store forwarding penalty that would come
17720 // with two 32-bit stores.
17721 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
17722 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
17723 MachinePointerInfo());
17724 // For i64 source, we need to add the appropriate power of 2 if the input
17725 // was negative. This is the same as the optimization in
17726 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
17727 // we must be careful to do the computation in x87 extended precision, not
17728 // in SSE. (The generic code can't know it's OK to do this, or how to.)
17729 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
17730 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
17731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
17732 MachineMemOperand::MOLoad, 8, 8);
17734 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
17735 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
17736 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
17739 APInt FF(32, 0x5F800000ULL);
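// 0x5F800000 is the IEEE single-precision bit pattern of 2^64. If the i64
// input had its sign bit set, the FILD above interpreted it as a negative
// signed value, so we conditionally add 2^64 below to recover the unsigned
// value.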
17741 // Check whether the sign bit is set.
17742 SDValue SignSet = DAG.getSetCC(
17743 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
17744 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
17746 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
17747 SDValue FudgePtr = DAG.getConstantPool(
17748 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
17750 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
17751 SDValue Zero = DAG.getIntPtrConstant(0, dl);
17752 SDValue Four = DAG.getIntPtrConstant(4, dl);
17753 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
17754 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
17756 // Load the value out, extending it from f32 to f80.
17757 // FIXME: Avoid the extend by constructing the right constant pool?
17758 SDValue Fudge = DAG.getExtLoad(
17759 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
17760 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
17761 /* Alignment = */ 4);
17762 // Extend everything to 80 bits to force it to be done on x87.
17763 // TODO: Are there any fast-math-flags to propagate here?
17764 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
17765 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
17766 DAG.getIntPtrConstant(0, dl));
17769 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
17770 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
17771 // just return an <SDValue(), SDValue()> pair.
17772 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
17773 // to i16, i32 or i64, and we lower it to a legal sequence.
17774 // If lowered to the final integer result we return a <result, SDValue()> pair.
17775 // Otherwise we lower it to a sequence ending with a FIST, return a
17776 // <FIST, StackSlot> pair, and the caller is responsible for loading
17777 // the final integer result from StackSlot.
17778 std::pair<SDValue,SDValue>
17779 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
17780 bool IsSigned, bool IsReplace) const {
17783 EVT DstTy = Op.getValueType();
17784 EVT TheVT = Op.getOperand(0).getValueType();
17785 auto PtrVT = getPointerTy(DAG.getDataLayout());
17787 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
17788 // f16 must be promoted before using the lowering in this routine.
17789 // fp128 does not use this lowering.
17790 return std::make_pair(SDValue(), SDValue());
17793 // If using FIST to compute an unsigned i64, we'll need some fixup
17794 // to handle values above the maximum signed i64. A FIST is always
17795 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
17796 bool UnsignedFixup = !IsSigned &&
17797 DstTy == MVT::i64 &&
17798 (!Subtarget.is64Bit() ||
17799 !isScalarFPTypeInSSEReg(TheVT));
17801 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
17802 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
17803 // The low 32 bits of the fist result will have the correct uint32 result.
17804 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
17808 assert(DstTy.getSimpleVT() <= MVT::i64 &&
17809 DstTy.getSimpleVT() >= MVT::i16 &&
17810 "Unknown FP_TO_INT to lower!");
17812 // These are really Legal.
17813 if (DstTy == MVT::i32 &&
17814 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
17815 return std::make_pair(SDValue(), SDValue());
17816 if (Subtarget.is64Bit() &&
17817 DstTy == MVT::i64 &&
17818 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
17819 return std::make_pair(SDValue(), SDValue());
17821 // We lower FP->int64 into FISTP64 followed by a load from a temporary stack slot.
17823 MachineFunction &MF = DAG.getMachineFunction();
17824 unsigned MemSize = DstTy.getSizeInBits()/8;
17825 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
17826 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
17829 switch (DstTy.getSimpleVT().SimpleTy) {
17830 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
17831 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
17832 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
17833 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
17836 SDValue Chain = DAG.getEntryNode();
17837 SDValue Value = Op.getOperand(0);
17838 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
17840 if (UnsignedFixup) {
17842 // Conversion to unsigned i64 is implemented with a select,
17843 // depending on whether the source value fits in the range
17844 // of a signed i64. Let Thresh be the FP equivalent of
17845 // 0x8000000000000000ULL.
17847 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
17848 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
17849 // Fist-to-mem64 FistSrc
17850 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
17851 // to XOR'ing the high 32 bits with Adjust.
17853 // Being a power of 2, Thresh is exactly representable in all FP formats.
17854 // For X87 we'd like to use the smallest FP type for this constant, but
17855 // for DAG type consistency we have to match the FP operand type.
17857 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
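// (0x5f000000 is the single-precision bit pattern of 2^63, i.e. the FP
// equivalent of 0x8000000000000000ULL mentioned above.)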
17858 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
17859 bool LosesInfo = false;
17860 if (TheVT == MVT::f64)
17861 // The rounding mode is irrelevant as the conversion should be exact.
17862 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
17864 else if (TheVT == MVT::f80)
17865 Status = Thresh.convert(APFloat::x87DoubleExtended(),
17866 APFloat::rmNearestTiesToEven, &LosesInfo);
17868 assert(Status == APFloat::opOK && !LosesInfo &&
17869 "FP conversion should have been exact");
17871 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
17873 SDValue Cmp = DAG.getSetCC(DL,
17874 getSetCCResultType(DAG.getDataLayout(),
17875 *DAG.getContext(), TheVT),
17876 Value, ThreshVal, ISD::SETLT);
17877 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
17878 DAG.getConstant(0, DL, MVT::i32),
17879 DAG.getConstant(0x80000000, DL, MVT::i32));
17880 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
17881 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
17882 *DAG.getContext(), TheVT),
17883 Value, ThreshVal, ISD::SETLT);
17884 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
17887 // FIXME This causes a redundant load/store if the SSE-class value is already
17888 // in memory, such as if it is on the callstack.
17889 if (isScalarFPTypeInSSEReg(TheVT)) {
17890 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
17891 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
17892 MachinePointerInfo::getFixedStack(MF, SSFI));
17893 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
17895 Chain, StackSlot, DAG.getValueType(TheVT)
17898 MachineMemOperand *MMO =
17899 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
17900 MachineMemOperand::MOLoad, MemSize, MemSize);
17901 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
17902 Chain = Value.getValue(1);
17903 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
17904 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
17907 MachineMemOperand *MMO =
17908 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
17909 MachineMemOperand::MOStore, MemSize, MemSize);
17911 if (UnsignedFixup) {
17913 // Insert the FIST, load its result as two i32's,
17914 // and XOR the high i32 with Adjust.
17916 SDValue FistOps[] = { Chain, Value, StackSlot };
17917 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
17918 FistOps, DstTy, MMO);
17921 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
17922 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
17925 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
17926 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
17928 if (Subtarget.is64Bit()) {
17929 // Join High32 and Low32 into a 64-bit result.
17930 // (High32 << 32) | Low32
17931 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
17932 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
17933 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
17934 DAG.getConstant(32, DL, MVT::i8));
17935 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
17936 return std::make_pair(Result, SDValue());
17939 SDValue ResultOps[] = { Low32, High32 };
17941 SDValue pair = IsReplace
17942 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
17943 : DAG.getMergeValues(ResultOps, DL);
17944 return std::make_pair(pair, SDValue());
17946 // Build the FP_TO_INT*_IN_MEM
17947 SDValue Ops[] = { Chain, Value, StackSlot };
17948 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
17950 return std::make_pair(FIST, StackSlot);
17954 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
17955 const X86Subtarget &Subtarget) {
17956 MVT VT = Op->getSimpleValueType(0);
17957 SDValue In = Op->getOperand(0);
17958 MVT InVT = In.getSimpleValueType();
17961 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
17962 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
17963 "Expected same number of elements");
17964 assert((VT.getVectorElementType() == MVT::i16 ||
17965 VT.getVectorElementType() == MVT::i32 ||
17966 VT.getVectorElementType() == MVT::i64) &&
17967 "Unexpected element type");
17968 assert((InVT.getVectorElementType() == MVT::i8 ||
17969 InVT.getVectorElementType() == MVT::i16 ||
17970 InVT.getVectorElementType() == MVT::i32) &&
17971 "Unexpected element type");
17973 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
17974 if (InVT == MVT::v8i8) {
17975 if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
17978 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
17979 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
17980 // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input.
17981 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In);
17984 if (Subtarget.hasInt256())
17987 // Optimize vectors in AVX mode:
17990 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
17991 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
17992 // Concat upper and lower parts.
17995 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
17996 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
17997 // Concat upper and lower parts.
18000 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18001 VT.getVectorNumElements() / 2);
18003 SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In);
18005 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
18006 SDValue Undef = DAG.getUNDEF(InVT);
18007 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
18008 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
18009 OpHi = DAG.getBitcast(HalfVT, OpHi);
18011 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18014 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
18015 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
18016 const SDLoc &dl, SelectionDAG &DAG) {
18017 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
18018 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
18019 DAG.getIntPtrConstant(0, dl));
18020 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
18021 DAG.getIntPtrConstant(8, dl));
18022 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
18023 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
18024 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
18025 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18028 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
18029 const X86Subtarget &Subtarget,
18030 SelectionDAG &DAG) {
18031 MVT VT = Op->getSimpleValueType(0);
18032 SDValue In = Op->getOperand(0);
18033 MVT InVT = In.getSimpleValueType();
18034 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
18036 unsigned NumElts = VT.getVectorNumElements();
18038 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
18039 // avoids a constant pool load.
18040 if (VT.getVectorElementType() != MVT::i8) {
18041 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
18042 return DAG.getNode(ISD::SRL, DL, VT, Extend,
18043 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
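// For example (sketch): zero-extending v8i1 to v8i32 becomes a v8i32
// sign_extend (each lane is 0 or -1) followed by a logical shift right by 31,
// leaving 0 or 1 per lane.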
18046 // Extend VT if BWI is not supported.
18048 if (!Subtarget.hasBWI()) {
18049 // If v16i32 is to be avoided, we'll need to split and concatenate.
18050 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
18051 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
18053 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
18056 // Widen to 512-bits if VLX is not supported.
18057 MVT WideVT = ExtVT;
18058 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
18059 NumElts *= 512 / ExtVT.getSizeInBits();
18060 InVT = MVT::getVectorVT(MVT::i1, NumElts);
18061 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
18062 In, DAG.getIntPtrConstant(0, DL));
18063 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
18067 SDValue One = DAG.getConstant(1, DL, WideVT);
18068 SDValue Zero = DAG.getConstant(0, DL, WideVT);
18070 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
18072 // Truncate if we had to extend above.
18074 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
18075 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
18078 // Extract back to 128/256-bit if we widened.
18080 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
18081 DAG.getIntPtrConstant(0, DL));
18083 return SelectedVal;
18086 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18087 SelectionDAG &DAG) {
18088 SDValue In = Op.getOperand(0);
18089 MVT SVT = In.getSimpleValueType();
18091 if (SVT.getVectorElementType() == MVT::i1)
18092 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
18094 assert(Subtarget.hasAVX() && "Expected AVX support");
18095 return LowerAVXExtend(Op, DAG, Subtarget);
18098 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
18099 /// It makes use of the fact that vectors with enough leading sign/zero bits
18100 /// prevent the PACKSS/PACKUS from saturating the results.
18101 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
18102 /// within each 128-bit lane.
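/// For example (sketch): truncating a v8i32 whose elements are known to be
/// sign-extended i16 values down to v8i16 becomes a single PACKSSDW of the
/// source's two 128-bit halves.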
18103 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
18104 const SDLoc &DL, SelectionDAG &DAG,
18105 const X86Subtarget &Subtarget) {
18106 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
18107 "Unexpected PACK opcode");
18108 assert(DstVT.isVector() && "VT not a vector?");
18110 // Requires SSE2 but AVX512 has fast vector truncate.
18111 if (!Subtarget.hasSSE2())
18114 EVT SrcVT = In.getValueType();
18116 // No truncation required, we might get here due to recursive calls.
18117 if (SrcVT == DstVT)
18120 // We only support vector truncation to 64 bits or greater from a
18121 // 128-bit or greater source.
18122 unsigned DstSizeInBits = DstVT.getSizeInBits();
18123 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
18124 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
18127 unsigned NumElems = SrcVT.getVectorNumElements();
18128 if (!isPowerOf2_32(NumElems))
18131 LLVMContext &Ctx = *DAG.getContext();
18132 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
18133 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
18135 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
18137 // Pack to the largest type possible:
18138 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
18139 EVT InVT = MVT::i16, OutVT = MVT::i8;
18140 if (SrcVT.getScalarSizeInBits() > 16 &&
18141 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
18146 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
18147 if (SrcVT.is128BitVector()) {
18148 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
18149 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
18150 In = DAG.getBitcast(InVT, In);
18151 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
18152 Res = extractSubVector(Res, 0, DAG, DL, 64);
18153 return DAG.getBitcast(DstVT, Res);
18156 // Extract lower/upper subvectors.
18157 unsigned NumSubElts = NumElems / 2;
18158 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
18159 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
18161 unsigned SubSizeInBits = SrcSizeInBits / 2;
18162 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
18163 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
18165 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
18166 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
18167 Lo = DAG.getBitcast(InVT, Lo);
18168 Hi = DAG.getBitcast(InVT, Hi);
18169 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
18170 return DAG.getBitcast(DstVT, Res);
18173 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
18174 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
18175 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
18176 Lo = DAG.getBitcast(InVT, Lo);
18177 Hi = DAG.getBitcast(InVT, Hi);
18178 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
18180 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
18181 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
18182 Res = DAG.getBitcast(MVT::v4i64, Res);
18183 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
18185 if (DstVT.is256BitVector())
18186 return DAG.getBitcast(DstVT, Res);
18188 // If 512bit -> 128bit truncate another stage.
18189 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
18190 Res = DAG.getBitcast(PackedVT, Res);
18191 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
18194 // Recursively pack lower/upper subvectors, concat result and pack again.
18195 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
18196 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
18197 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
18198 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
18200 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
18201 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
18202 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
18205 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
18206 const X86Subtarget &Subtarget) {
18209 MVT VT = Op.getSimpleValueType();
18210 SDValue In = Op.getOperand(0);
18211 MVT InVT = In.getSimpleValueType();
18213 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
18215 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
18216 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
18217 if (InVT.getScalarSizeInBits() <= 16) {
18218 if (Subtarget.hasBWI()) {
18219 // legal, will go to VPMOVB2M, VPMOVW2M
18220 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
18221 // We need to shift to get the lsb into the sign position.
18222 // Shifting packed bytes is not supported natively, so bitcast to words.
18223 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
18224 In = DAG.getNode(ISD::SHL, DL, ExtVT,
18225 DAG.getBitcast(ExtVT, In),
18226 DAG.getConstant(ShiftInx, DL, ExtVT));
18227 In = DAG.getBitcast(InVT, In);
18229 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
18232 // Use TESTD/Q, extended vector to packed dword/qword.
18233 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
18234 "Unexpected vector type.");
18235 unsigned NumElts = InVT.getVectorNumElements();
18236 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
18237 // We need to change to a wider element type that we have support for.
18238 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
18239 // For 16 element vectors we extend to v16i32 unless we are explicitly
18240 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
18241 // we need to split into two 8 element vectors which we can extend to v8i32,
18242 // truncate and concat the results. There's an additional complication if
18243 // the original type is v16i8. In that case we can't split the v16i8 so
18244 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
18245 // to v8i32, truncate that to v8i1 and concat the two halves.
18246 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
18247 if (InVT == MVT::v16i8) {
18248 // First we need to sign extend up to 256-bits so we can split that.
18249 InVT = MVT::v16i16;
18250 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
18252 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
18253 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
18254 // We're split now, just emit two truncates and a concat. The two
18255 // truncates will trigger legalization to come back to this function.
18256 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
18257 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
18258 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
18260 // We either have 8 elements or we're allowed to use 512-bit vectors.
18261 // If we have VLX, we want to use the narrowest vector that can get the
18262 // job done so we use vXi32.
18263 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
18264 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
18265 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
18267 ShiftInx = InVT.getScalarSizeInBits() - 1;
18270 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
18271 // We need to shift to get the lsb into sign position.
18272 In = DAG.getNode(ISD::SHL, DL, InVT, In,
18273 DAG.getConstant(ShiftInx, DL, InVT));
18275 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
18276 if (Subtarget.hasDQI())
18277 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
18278 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
18281 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
18283 MVT VT = Op.getSimpleValueType();
18284 SDValue In = Op.getOperand(0);
18285 MVT InVT = In.getSimpleValueType();
18286 unsigned InNumEltBits = InVT.getScalarSizeInBits();
18288 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
18289 "Invalid TRUNCATE operation");
18291 // If called by the legalizer just return.
18292 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
18295 if (VT.getVectorElementType() == MVT::i1)
18296 return LowerTruncateVecI1(Op, DAG, Subtarget);
18298 // vpmovqb/w/d, vpmovdb/w, vpmovwb
18299 if (Subtarget.hasAVX512()) {
18300 // Word to byte only under BWI. Otherwise we have to promote to v16i32
18301 // and then truncate that. But we should only do that if we haven't been
18302 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
18303 // handled by isel patterns.
18304 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
18305 Subtarget.canExtendTo512DQ())
18309 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
18310 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
18312 // Truncate with PACKUS if we are truncating a vector with leading zero bits
18313 // that extend all the way to the packed/truncated value.
18314 // Pre-SSE41 we can only use PACKUSWB.
18315 KnownBits Known = DAG.computeKnownBits(In);
18316 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
18318 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
18321 // Truncate with PACKSS if we are truncating a vector with sign-bits that
18322 // extend all the way to the packed/truncated value.
18323 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
18325 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
18328 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
18329 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
18330 if (Subtarget.hasInt256()) {
18331 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
18332 In = DAG.getBitcast(MVT::v8i32, In);
18333 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
18334 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
18335 DAG.getIntPtrConstant(0, DL));
18338 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
18339 DAG.getIntPtrConstant(0, DL));
18340 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
18341 DAG.getIntPtrConstant(2, DL));
18342 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
18343 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
18344 static const int ShufMask[] = {0, 2, 4, 6};
18345 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
18348 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
18349 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
18350 if (Subtarget.hasInt256()) {
18351 In = DAG.getBitcast(MVT::v32i8, In);
18353 // The PSHUFB mask:
18354 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
18355 -1, -1, -1, -1, -1, -1, -1, -1,
18356 16, 17, 20, 21, 24, 25, 28, 29,
18357 -1, -1, -1, -1, -1, -1, -1, -1 };
18358 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
18359 In = DAG.getBitcast(MVT::v4i64, In);
18361 static const int ShufMask2[] = {0, 2, -1, -1};
18362 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
18363 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
18364 DAG.getIntPtrConstant(0, DL));
18365 return DAG.getBitcast(VT, In);
18368 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
18369 DAG.getIntPtrConstant(0, DL));
18371 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
18372 DAG.getIntPtrConstant(4, DL));
18374 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
18375 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
18377 // The PSHUFB mask:
18378 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
18379 -1, -1, -1, -1, -1, -1, -1, -1};
18381 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
18382 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
18384 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
18385 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
18387 // The MOVLHPS Mask:
18388 static const int ShufMask2[] = {0, 1, 4, 5};
18389 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
18390 return DAG.getBitcast(MVT::v8i16, res);
18393 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
18394 // Use an AND to zero upper bits for PACKUS.
18395 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
18397 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
18398 DAG.getIntPtrConstant(0, DL));
18399 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
18400 DAG.getIntPtrConstant(8, DL));
18401 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
18404 // Handle truncation of V256 to V128 using shuffles.
18405 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
18407 assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
18409 unsigned NumElems = VT.getVectorNumElements();
18410 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
18412 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
18413 // Prepare truncation shuffle mask
18414 for (unsigned i = 0; i != NumElems; ++i)
18415 MaskVec[i] = i * 2;
18416 In = DAG.getBitcast(NVT, In);
18417 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
18418 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
18419 DAG.getIntPtrConstant(0, DL));
18422 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
18423 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
18424 MVT VT = Op.getSimpleValueType();
18426 if (VT.isVector()) {
18427 SDValue Src = Op.getOperand(0);
18430 if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
18431 MVT ResVT = MVT::v4i32;
18432 MVT TruncVT = MVT::v4i1;
18433 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
18434 if (!IsSigned && !Subtarget.hasVLX()) {
18435 // Widen to 512-bits.
18436 ResVT = MVT::v8i32;
18437 TruncVT = MVT::v8i1;
18438 Opc = ISD::FP_TO_UINT;
18439 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
18440 DAG.getUNDEF(MVT::v8f64),
18441 Src, DAG.getIntPtrConstant(0, dl));
18443 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
18444 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
18445 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
18446 DAG.getIntPtrConstant(0, dl));
18449 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
18450 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
18451 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
18452 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
18453 DAG.getUNDEF(MVT::v2f32)));
18459 assert(!VT.isVector());
18461 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
18462 IsSigned, /*IsReplace=*/ false);
18463 SDValue FIST = Vals.first, StackSlot = Vals.second;
18464 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
18465 if (!FIST.getNode())
18468 if (StackSlot.getNode())
18469 // Load the result.
18470 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
18472 // The node is the result.
18476 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
18478 MVT VT = Op.getSimpleValueType();
18479 SDValue In = Op.getOperand(0);
18480 MVT SVT = In.getSimpleValueType();
18482 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
18484 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
18485 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
18486 In, DAG.getUNDEF(SVT)));
18489 /// Horizontal vector math instructions may be slower than normal math with
18490 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
18491 /// implementation, and likely shuffle complexity of the alternate sequence.
18492 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
18493 const X86Subtarget &Subtarget) {
18494 bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
18495 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
18496 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
18499 /// Depending on uarch and/or optimizing for size, we might prefer to use a
18500 /// vector operation in place of the typical scalar operation.
18501 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
18502 const X86Subtarget &Subtarget) {
18503 // If both operands have other uses, this is probably not profitable.
18504 SDValue LHS = Op.getOperand(0);
18505 SDValue RHS = Op.getOperand(1);
18506 if (!LHS.hasOneUse() && !RHS.hasOneUse())
18509 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
18510 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
18511 if (IsFP && !Subtarget.hasSSE3())
18513 if (!IsFP && !Subtarget.hasSSSE3())
18516 // Defer forming the minimal horizontal op if the vector source has more than
18517 // the 2 extract element uses that we're matching here. In that case, we might
18518 // form a horizontal op that includes more than 1 add/sub op.
18519 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18520 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18521 LHS.getOperand(0) != RHS.getOperand(0) ||
18522 !LHS.getOperand(0)->hasNUsesOfValue(2, 0))
18525 if (!isa<ConstantSDNode>(LHS.getOperand(1)) ||
18526 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
18527 !shouldUseHorizontalOp(true, DAG, Subtarget))
18530 // Allow commuted 'hadd' ops.
18531 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
18533 switch (Op.getOpcode()) {
18534 case ISD::ADD: HOpcode = X86ISD::HADD; break;
18535 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
18536 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
18537 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
18539 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
18541 unsigned LExtIndex = LHS.getConstantOperandVal(1);
18542 unsigned RExtIndex = RHS.getConstantOperandVal(1);
18543 if (LExtIndex == 1 && RExtIndex == 0 &&
18544 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
18545 std::swap(LExtIndex, RExtIndex);
18547 // TODO: This can be extended to handle other adjacent extract pairs.
18548 if (LExtIndex != 0 || RExtIndex != 1)
18551 SDValue X = LHS.getOperand(0);
18552 EVT VecVT = X.getValueType();
18553 unsigned BitWidth = VecVT.getSizeInBits();
18554 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
18555 "Not expecting illegal vector widths here");
18557 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
18558 // equivalent, so extract the 256/512-bit source op to 128-bit.
18559 // This is free: ymm/zmm -> xmm.
18561 if (BitWidth == 256 || BitWidth == 512)
18562 X = extract128BitVector(X, 0, DAG, DL);
18564 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
18565 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
18566 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
18567 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
18568 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
18569 DAG.getIntPtrConstant(0, DL));
18572 /// Depending on uarch and/or optimizing for size, we might prefer to use a
18573 /// vector operation in place of the typical scalar operation.
18574 static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
18575 const X86Subtarget &Subtarget) {
18576 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
18577 "Only expecting float/double");
18578 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
18581 /// The only differences between FABS and FNEG are the mask and the logic op.
18582 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
18583 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
18584 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
18585 "Wrong opcode for lowering FABS or FNEG.");
18587 bool IsFABS = (Op.getOpcode() == ISD::FABS);
18589 // If this is a FABS and it has an FNEG user, bail out to fold the combination
18590 // into an FNABS. We'll lower the FABS after that if it is still in use.
18592 for (SDNode *User : Op->uses())
18593 if (User->getOpcode() == ISD::FNEG)
18597 MVT VT = Op.getSimpleValueType();
18599 bool IsF128 = (VT == MVT::f128);
18600 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
18601 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
18602 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
18603 "Unexpected type in LowerFABSorFNEG");
18605 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
18606 // decide if we should generate a 16-byte constant mask when we only need 4 or
18607 // 8 bytes for the scalar case.
18609 // There are no scalar bitwise logical SSE/AVX instructions, so we
18610 // generate a 16-byte vector constant and logic op even for the scalar case.
18611 // Using a 16-byte mask allows folding the load of the mask with
18612 // the logic op, which can save ~4 bytes of code size.
18613 bool IsFakeVector = !VT.isVector() && !IsF128;
18616 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
18618 unsigned EltBits = VT.getScalarSizeInBits();
18619 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
18620 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
18621 APInt::getSignMask(EltBits);
18622 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
18623 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
18625 SDValue Op0 = Op.getOperand(0);
18626 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
18627 unsigned LogicOp = IsFABS ? X86ISD::FAND :
18628 IsFNABS ? X86ISD::FOR :
18630 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
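// For example, with f32 this becomes: FABS -> AND with 0x7fffffff,
// FNEG -> XOR with 0x80000000, and FNABS (FNEG of FABS) -> OR with
// 0x80000000, which unconditionally sets the sign bit.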
18632 if (VT.isVector() || IsF128)
18633 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
18635 // For the scalar case extend to a 128-bit vector, perform the logic op,
18636 // and extract the scalar result back out.
18637 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
18638 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
18639 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
18640 DAG.getIntPtrConstant(0, dl));
18643 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
18644 SDValue Mag = Op.getOperand(0);
18645 SDValue Sign = Op.getOperand(1);
18648 // If the sign operand is smaller, extend it first.
18649 MVT VT = Op.getSimpleValueType();
18650 if (Sign.getSimpleValueType().bitsLT(VT))
18651 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
18653 // And if it is bigger, shrink it first.
18654 if (Sign.getSimpleValueType().bitsGT(VT))
18655 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
18657 // At this point the operands and the result should have the same
18658 // type, and that won't be f80 since that is not custom lowered.
18659 bool IsF128 = (VT == MVT::f128);
18660 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
18661 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
18662 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
18663 "Unexpected type in LowerFCOPYSIGN");
18665 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
18667 // Perform all scalar logic operations as 16-byte vectors because there are no
18668 // scalar FP logic instructions in SSE.
18669 // TODO: This isn't necessary. If we used scalar types, we might avoid some
18670 // unnecessary splats, but we might miss load folding opportunities. Should
18671 // this decision be based on OptimizeForSize?
18672 bool IsFakeVector = !VT.isVector() && !IsF128;
18675 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
18677 // The mask constants are automatically splatted for vector types.
18678 unsigned EltSizeInBits = VT.getScalarSizeInBits();
18679 SDValue SignMask = DAG.getConstantFP(
18680 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
18681 SDValue MagMask = DAG.getConstantFP(
18682 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
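// In scalar f32 terms the sequence below computes copysign(Mag, Sign) as
// (Mag & 0x7fffffff) | (Sign & 0x80000000): keep all magnitude bits of the
// first operand and only the sign bit of the second.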
18684 // First, clear all bits but the sign bit from the second operand (sign).
18686 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
18687 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
18689 // Next, clear the sign bit from the first operand (magnitude).
18690 // TODO: If we had general constant folding for FP logic ops, this check
18691 // wouldn't be necessary.
18693 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
18694 APFloat APF = Op0CN->getValueAPF();
18696 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
18698 // If the magnitude operand wasn't a constant, we need to AND out the sign.
18700 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
18701 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
18704 // OR the magnitude value with the sign bit.
18705 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
18706 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
18707 DAG.getIntPtrConstant(0, dl));
18710 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
18711 SDValue N0 = Op.getOperand(0);
18713 MVT VT = Op.getSimpleValueType();
18715 MVT OpVT = N0.getSimpleValueType();
18716 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
18717 "Unexpected type for FGETSIGN");
18719 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
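// MOVMSK gathers the sign bit of every vector element into the low bits of
// a GPR, so ANDing with 1 isolates the sign of element 0: the result is 1
// for a negative input (including -0.0) and 0 otherwise.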
18720 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
18721 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
18722 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
18723 Res = DAG.getZExtOrTrunc(Res, dl, VT);
18724 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
18728 /// Helper for creating an X86ISD::SETCC node.
18729 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
18730 SelectionDAG &DAG) {
18731 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18732 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
18735 // Check whether an OR'd tree is PTEST-able.
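// Roughly: a scalar equality test against zero of
//   (or (extractelt V, 0), (or (extractelt V, 1), ...))
// where the extracts cover every element of one or more same-typed
// 128/256-bit vectors can be done with a single PTEST of the OR of those
// vectors, since PTEST sets ZF exactly when its source is all zeroes.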
18736 static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
18737 const X86Subtarget &Subtarget,
18740 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
18742 if (!Subtarget.hasSSE41())
18745 if (!Op->hasOneUse())
18748 SDNode *N = Op.getNode();
18751 SmallVector<SDValue, 8> Opnds;
18752 DenseMap<SDValue, unsigned> VecInMap;
18753 SmallVector<SDValue, 8> VecIns;
18754 EVT VT = MVT::Other;
18756 // Recognize a special case where a vector is cast into a wide integer to test if it is all zeros.
18758 Opnds.push_back(N->getOperand(0));
18759 Opnds.push_back(N->getOperand(1));
18761 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
18762 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
18763 // BFS traverse all OR'd operands.
18764 if (I->getOpcode() == ISD::OR) {
18765 Opnds.push_back(I->getOperand(0));
18766 Opnds.push_back(I->getOperand(1));
18767 // Re-evaluate the number of nodes to be traversed.
18768 e += 2; // 2 more nodes (LHS and RHS) are pushed.
18772 // Quit if this is not an EXTRACT_VECTOR_ELT.
18773 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18776 // Quit if the index is not a constant.
18777 SDValue Idx = I->getOperand(1);
18778 if (!isa<ConstantSDNode>(Idx))
18781 SDValue ExtractedFromVec = I->getOperand(0);
18782 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
18783 if (M == VecInMap.end()) {
18784 VT = ExtractedFromVec.getValueType();
18785 // Quit if not 128/256-bit vector.
18786 if (!VT.is128BitVector() && !VT.is256BitVector())
18788 // Quit if not the same type.
18789 if (VecInMap.begin() != VecInMap.end() &&
18790 VT != VecInMap.begin()->first.getValueType())
18792 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
18793 VecIns.push_back(ExtractedFromVec);
18795 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
18798 assert((VT.is128BitVector() || VT.is256BitVector()) &&
18799 "Not extracted from 128-/256-bit vector.");
18801 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
18803 for (DenseMap<SDValue, unsigned>::const_iterator
18804 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
18805 // Quit if not all elements are used.
18806 if (I->second != FullMask)
18810 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
18812 // Cast all vectors into TestVT for PTEST.
18813 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
18814 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
18816 // If more than one full vector is evaluated, OR them first before PTEST.
18817 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
18818 // Each iteration will OR 2 nodes and append the result until there is only
18819 // 1 node left, i.e. the final OR'd value of all vectors.
18820 SDValue LHS = VecIns[Slot];
18821 SDValue RHS = VecIns[Slot + 1];
18822 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
18825 X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
18827 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
18828 VecIns.back(), VecIns.back());
18831 /// Return true if \c Op has a use that doesn't just read flags.
18832 static bool hasNonFlagsUse(SDValue Op) {
18833 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
18835 SDNode *User = *UI;
18836 unsigned UOpNo = UI.getOperandNo();
18837 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
18838 // Look past the truncate.
18839 UOpNo = User->use_begin().getOperandNo();
18840 User = *User->use_begin();
18843 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
18844 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
18850 /// Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
18852 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
18853 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
18854 // CF and OF aren't always set the way we want. Determine which
18855 // of these we need.
18856 bool NeedCF = false;
18857 bool NeedOF = false;
18860 case X86::COND_A: case X86::COND_AE:
18861 case X86::COND_B: case X86::COND_BE:
18864 case X86::COND_G: case X86::COND_GE:
18865 case X86::COND_L: case X86::COND_LE:
18866 case X86::COND_O: case X86::COND_NO: {
18867 // Check if we really need to set the Overflow flag.
18868 // If the NoSignedWrap flag is present, signed overflow
18869 // cannot occur, so the flag is not actually needed.
18870 switch (Op->getOpcode()) {
18875 if (Op.getNode()->getFlags().hasNoSignedWrap())
18885 // See if we can use the EFLAGS value from the operand instead of
18886 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
18887 // we prove that the arithmetic won't overflow, we can't use OF or CF.
18888 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
18889 // Emit a CMP with 0, which is the TEST pattern.
18890 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
18891 DAG.getConstant(0, dl, Op.getValueType()));
18893 unsigned Opcode = 0;
18894 unsigned NumOperands = 0;
18896 SDValue ArithOp = Op;
18898 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
18899 // which may be the result of a CAST. We use the variable 'Op', which is the
18900 // original (un-cast) value, when we check for possible users.
18901 switch (ArithOp.getOpcode()) {
18903 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
18904 // because a TEST instruction will be better.
18905 if (!hasNonFlagsUse(Op))
18913 // Transform to an x86-specific ALU node with flags if there is a chance of
18914 // using an RMW op or only the flags are used. Otherwise, leave
18915 // the node alone and emit a 'test' instruction.
18916 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
18917 UE = Op.getNode()->use_end(); UI != UE; ++UI)
18918 if (UI->getOpcode() != ISD::CopyToReg &&
18919 UI->getOpcode() != ISD::SETCC &&
18920 UI->getOpcode() != ISD::STORE)
18923 // Otherwise use a regular EFLAGS-setting instruction.
18924 switch (ArithOp.getOpcode()) {
18925 default: llvm_unreachable("unexpected operator!");
18926 case ISD::ADD: Opcode = X86ISD::ADD; break;
18927 case ISD::SUB: Opcode = X86ISD::SUB; break;
18928 case ISD::XOR: Opcode = X86ISD::XOR; break;
18929 case ISD::AND: Opcode = X86ISD::AND; break;
18930 case ISD::OR: Opcode = X86ISD::OR; break;
18940 return SDValue(Op.getNode(), 1);
18947 // Emit a CMP with 0, which is the TEST pattern.
18948 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
18949 DAG.getConstant(0, dl, Op.getValueType()));
18951 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18952 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
18954 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
18955 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
18956 return SDValue(New.getNode(), 1);
18959 /// Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
18961 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
18962 const SDLoc &dl, SelectionDAG &DAG) const {
18963 if (isNullConstant(Op1))
18964 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
18966 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
18967 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
18968 // Only promote the compare up to i32 if it is a 16-bit operation
18969 // with an immediate. 16-bit immediates are to be avoided.
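// (16-bit compares with an immediate need the 0x66 operand-size prefix, and
// such length-changing prefixes can stall the decoders on many Intel cores,
// so widening to 32 bits is usually the cheaper choice.)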
18970 if (Op0.getValueType() == MVT::i16 &&
18971 ((isa<ConstantSDNode>(Op0) &&
18972 !cast<ConstantSDNode>(Op0)->getAPIntValue().isSignedIntN(8)) ||
18973 (isa<ConstantSDNode>(Op1) &&
18974 !cast<ConstantSDNode>(Op1)->getAPIntValue().isSignedIntN(8))) &&
18975 !DAG.getMachineFunction().getFunction().optForMinSize() &&
18976 !Subtarget.isAtom()) {
18977 unsigned ExtendOp =
18978 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18979 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
18980 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
18982 // Use SUB instead of CMP to enable CSE between SUB and CMP.
18983 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
18984 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
18985 return SDValue(Sub.getNode(), 1);
18987 assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!");
18988 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
18991 /// Convert a comparison if required by the subtarget.
18992 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
18993 SelectionDAG &DAG) const {
18994 // If the subtarget does not support the FUCOMI instruction, floating-point
18995 // comparisons have to be converted.
18996 if (Subtarget.hasCMov() ||
18997 Cmp.getOpcode() != X86ISD::CMP ||
18998 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
18999 !Cmp.getOperand(1).getValueType().isFloatingPoint())
19002 // The instruction selector will select an FUCOM instruction instead of
19003 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
19004 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
19005 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
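// This is the classic "fucom; fnstsw %ax; sahf" idiom: the shift right by 8
// moves the FPSW condition bits C0/C2/C3 into AH, and SAHF then copies them
// into CF/PF/ZF of EFLAGS respectively.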
19007 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
19008 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
19009 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
19010 DAG.getConstant(8, dl, MVT::i8));
19011 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
19013 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
19014 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
19015 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
19018 /// Check if replacement of SQRT with RSQRT should be disabled.
19019 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
19020 EVT VT = Op.getValueType();
19022 // We never want to use both SQRT and RSQRT instructions for the same input.
19023 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
19027 return Subtarget.hasFastVectorFSQRT();
19028 return Subtarget.hasFastScalarFSQRT();
19031 /// The minimum architected relative accuracy is 2^-12. We need one
19032 /// Newton-Raphson step to have a good float result (24 bits of precision).
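/// A single step refines an estimate E of 1/sqrt(X) as
///   E' = E * (1.5 - 0.5 * X * E * E),
/// roughly doubling the number of correct bits (~12 -> ~23).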
19033 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
19034 SelectionDAG &DAG, int Enabled,
19035 int &RefinementSteps,
19036 bool &UseOneConstNR,
19037 bool Reciprocal) const {
19038 EVT VT = Op.getValueType();
19040 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
19041 // It is likely not profitable to do this for f64 because a double-precision
19042 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
19043 // instructions: convert to single, rsqrtss, convert back to double, refine
19044 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
19045 // along with FMA, this could be a throughput win.
19046 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
19047 // after type legalization.
19048 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
19049 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
19050 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
19051 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
19052 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
19053 if (RefinementSteps == ReciprocalEstimate::Unspecified)
19054 RefinementSteps = 1;
19056 UseOneConstNR = false;
19057 // There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
19058 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
19059 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
19064 /// The minimum architected relative accuracy is 2^-12. We need one
19065 /// Newton-Raphson step to have a good float result (24 bits of precision).
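/// A single step refines an estimate E of 1/X as
///   E' = E * (2 - X * E),
/// roughly doubling the number of correct bits per iteration.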
19066 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
19068 int &RefinementSteps) const {
19069 EVT VT = Op.getValueType();
19071 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
19072 // It is likely not profitable to do this for f64 because a double-precision
19073 // reciprocal estimate with refinement on x86 prior to FMA requires
19074 // 15 instructions: convert to single, rcpss, convert back to double, refine
19075 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
19076 // along with FMA, this could be a throughput win.
19078 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
19079 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
19080 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
19081 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
19082 // Enable estimate codegen with 1 refinement step for vector division.
19083 // Scalar division estimates are disabled because they break too much
19084 // real-world code. These defaults are intended to match GCC behavior.
19085 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
19088 if (RefinementSteps == ReciprocalEstimate::Unspecified)
19089 RefinementSteps = 1;
19091 // There is no FRCP for 512-bit vectors, but there is RCP14.
19092 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
19093 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
19098 /// If we have at least two divisions that use the same divisor, convert to
19099 /// multiplication by a reciprocal. This may need to be adjusted for a given
19100 /// CPU if a division's cost is not at least twice the cost of a multiplication.
19101 /// This is because we still need one division to calculate the reciprocal and
19102 /// then we need two multiplies by that reciprocal as replacements for the
19103 /// original divisions.
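/// For example, a/b and c/b become t = 1.0/b; a*t; c*t: one divide plus two
/// multiplies instead of two divides, so it only pays off with at least two
/// divisions sharing the divisor.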
19104 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
19108 /// Result of 'and' is compared against zero. Change to a BT node if possible.
19109 /// Returns the BT node and the condition code needed to use it.
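/// For example, for an equality test against zero, both (and X, (shl 1, N))
/// and (and (srl X, N), 1) become (BT X, N); BT copies the selected bit into
/// CF, so SETEQ maps to COND_AE (bit clear) and SETNE to COND_B (bit set).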
19110 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
19111 const SDLoc &dl, SelectionDAG &DAG,
19113 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
19114 SDValue Op0 = And.getOperand(0);
19115 SDValue Op1 = And.getOperand(1);
19116 if (Op0.getOpcode() == ISD::TRUNCATE)
19117 Op0 = Op0.getOperand(0);
19118 if (Op1.getOpcode() == ISD::TRUNCATE)
19119 Op1 = Op1.getOperand(0);
19121 SDValue Src, BitNo;
19122 if (Op1.getOpcode() == ISD::SHL)
19123 std::swap(Op0, Op1);
19124 if (Op0.getOpcode() == ISD::SHL) {
19125 if (isOneConstant(Op0.getOperand(0))) {
19126 // If we looked past a truncate, check that it's only truncating away known zeros.
19128 unsigned BitWidth = Op0.getValueSizeInBits();
19129 unsigned AndBitWidth = And.getValueSizeInBits();
19130 if (BitWidth > AndBitWidth) {
19131 KnownBits Known = DAG.computeKnownBits(Op0);
19132 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
19136 BitNo = Op0.getOperand(1);
19138 } else if (Op1.getOpcode() == ISD::Constant) {
19139 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
19140 uint64_t AndRHSVal = AndRHS->getZExtValue();
19141 SDValue AndLHS = Op0;
19143 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
19144 Src = AndLHS.getOperand(0);
19145 BitNo = AndLHS.getOperand(1);
19147 // Use BT if the immediate can't be encoded in a TEST instruction or we
19148 // are optimizing for size and the immediate won't fit in a byte.
19149 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
19150 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
19151 isPowerOf2_64(AndRHSVal)) {
19153 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
19154 Src.getValueType());
19159 // No patterns found, give up.
19160 if (!Src.getNode())
19163 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
19164 // instruction. Since the shift amount is in-range-or-undefined, we know
19165 // that doing a bittest on the i32 value is ok. We extend to i32 because
19166 // the encoding for the i16 version is larger than the i32 version.
19167 // Also promote i16 to i32 for performance / code size reasons.
19168 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
19169 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
19171 // See if we can use the 32-bit instruction instead of the 64-bit one for a
19172 // shorter encoding. Since the former takes BitNo modulo 32 and the latter
19173 // takes it modulo 64, this is only valid if the 5th bit of BitNo is
19174 // known to be zero.
19175 if (Src.getValueType() == MVT::i64 &&
19176 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
19177 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
19179 // If the operand types disagree, extend the shift amount to match. Since
19180 // BT ignores high bits (like shifts) we can use anyextend.
19181 if (Src.getValueType() != BitNo.getValueType())
19182 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
19184 X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
19186 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
19190 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
19191 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
19196 // SSE Condition code mapping:
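//  0 - EQ
//  1 - LT
//  2 - LE
//  3 - UNORD
//  4 - NEQ
//  5 - NLT
//  6 - NLE
//  7 - ORD
// Predicates 8 and up (e.g. EQ_UQ = 8, NEQ_OQ = 12) are only encodable with
// the AVX VCMP forms, hence the AVX checks in this function's callers.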
19205 switch (SetCCOpcode) {
19206 default: llvm_unreachable("Unexpected SETCC condition");
19208 case ISD::SETEQ: SSECC = 0; break;
19210 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
19212 case ISD::SETOLT: SSECC = 1; break;
19214 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
19216 case ISD::SETOLE: SSECC = 2; break;
19217 case ISD::SETUO: SSECC = 3; break;
19219 case ISD::SETNE: SSECC = 4; break;
19220 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
19221 case ISD::SETUGE: SSECC = 5; break;
19222 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
19223 case ISD::SETUGT: SSECC = 6; break;
19224 case ISD::SETO: SSECC = 7; break;
19225 case ISD::SETUEQ: SSECC = 8; break;
19226 case ISD::SETONE: SSECC = 12; break;
19229 std::swap(Op0, Op1);
19234 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
19235 /// concatenate the result back.
19236 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
19237 MVT VT = Op.getSimpleValueType();
19239 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
19240 "Unsupported value type for operation");
19242 unsigned NumElems = VT.getVectorNumElements();
19244 SDValue CC = Op.getOperand(2);
19246 // Extract the LHS vectors
19247 SDValue LHS = Op.getOperand(0);
19248 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
19249 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
19251 // Extract the RHS vectors
19252 SDValue RHS = Op.getOperand(1);
19253 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
19254 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
19256 // Issue the operation on the smaller types and concatenate the result back
19257 MVT EltVT = VT.getVectorElementType();
19258 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19259 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19260 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
19261 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
19264 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
19266 SDValue Op0 = Op.getOperand(0);
19267 SDValue Op1 = Op.getOperand(1);
19268 SDValue CC = Op.getOperand(2);
19269 MVT VT = Op.getSimpleValueType();
19272 assert(VT.getVectorElementType() == MVT::i1 &&
19273 "Cannot set masked compare for this operation");
19275 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
19277 // If this is a seteq make sure any build vectors of all zeros are on the RHS.
19278 // This helps with vptestm matching.
19279 // TODO: Should we just canonicalize the setcc during DAG combine?
19280 if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
19281 ISD::isBuildVectorAllZeros(Op0.getNode()))
19282 std::swap(Op0, Op1);
19284 // Prefer SETGT over SETLT.
19285 if (SetCCOpcode == ISD::SETLT) {
19286 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
19287 std::swap(Op0, Op1);
19290 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
19293 /// Given a simple buildvector constant, return a new vector constant with each
19294 /// element decremented. If decrementing would result in underflow or this
19295 /// is not a simple vector constant, return an empty value.
19296 static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
19297 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
19301 MVT VT = V.getSimpleValueType();
19302 MVT EltVT = VT.getVectorElementType();
19303 unsigned NumElts = VT.getVectorNumElements();
19304 SmallVector<SDValue, 8> NewVecC;
19306 for (unsigned i = 0; i < NumElts; ++i) {
19307 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
19308 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
19311 // Avoid underflow.
19312 if (Elt->getAPIntValue().isNullValue())
19315 NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
19318 return DAG.getBuildVector(VT, DL, NewVecC);
19321 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for Op0 <=u Op1:
19323 /// t = psubus Op0, Op1
19324 /// pcmpeq t, <0..0>
19325 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
19326 ISD::CondCode Cond, const SDLoc &dl,
19327 const X86Subtarget &Subtarget,
19328 SelectionDAG &DAG) {
19329 if (!Subtarget.hasSSE2())
19332 MVT VET = VT.getVectorElementType();
19333 if (VET != MVT::i8 && VET != MVT::i16)
19339 case ISD::SETULT: {
19340 // If the comparison is against a constant we can turn this into a
19341 // setule. With psubus, setule does not require a swap. This is
19342 // beneficial because the constant in the register is no longer clobbered
19343 // as the destination operand, so it can be hoisted out of a loop.
19344 // Only do this pre-AVX since vpcmp* is no longer destructive.
19345 if (Subtarget.hasAVX())
19347 SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
19353 // Psubus is better than flip-sign because it requires no inversion.
19355 std::swap(Op0, Op1);
19361 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
19362 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
19363 DAG.getConstant(0, dl, VT));
19366 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
19367 SelectionDAG &DAG) {
19368 SDValue Op0 = Op.getOperand(0);
19369 SDValue Op1 = Op.getOperand(1);
19370 SDValue CC = Op.getOperand(2);
19371 MVT VT = Op.getSimpleValueType();
19372 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
19373 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
19378 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
19379 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
19383 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
19384 assert(VT.getVectorNumElements() <= 16);
19385 Opc = X86ISD::CMPM;
19387 Opc = X86ISD::CMPP;
19388 // The SSE/AVX packed FP comparison nodes are defined with a
19389 // floating-point vector result that matches the operand type. This allows
19390 // them to work with an SSE1 target (integer vector types are not legal).
19391 VT = Op0.getSimpleValueType();
19394 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
19395 // emit two comparisons and a logic op to tie them together.
19397 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
19398 if (SSECC >= 8 && !Subtarget.hasAVX()) {
19399 // LLVM predicate is SETUEQ or SETONE.
19401 unsigned CombineOpc;
19402 if (Cond == ISD::SETUEQ) {
19405 CombineOpc = X86ISD::FOR;
19407 assert(Cond == ISD::SETONE);
19410 CombineOpc = X86ISD::FAND;
19413 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
19414 DAG.getConstant(CC0, dl, MVT::i8));
19415 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
19416 DAG.getConstant(CC1, dl, MVT::i8));
19417 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
19419 // Handle all other FP comparisons here.
19420 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
19421 DAG.getConstant(SSECC, dl, MVT::i8));
19424 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
19425 // result type of SETCC. The bitcast is expected to be optimized away
19426 // during combining/isel.
19427 if (Opc == X86ISD::CMPP)
19428 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
19433 MVT VTOp0 = Op0.getSimpleValueType();
19434 assert(VTOp0 == Op1.getSimpleValueType() &&
19435 "Expected operands with same type!");
19436 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
19437 "Invalid number of packed elements for source and destination!");
19439 // This is being called by type legalization because v2i32 is marked custom
19440 // for result type legalization for v2f32.
19441 if (VTOp0 == MVT::v2i32)
19444 // The non-AVX512 code below works under the assumption that source and
19445 // destination types are the same.
19446 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
19447 "Value types for source and destination must be the same!");
19449 // Break 256-bit integer vector compare into smaller ones.
19450 if (VT.is256BitVector() && !Subtarget.hasInt256())
19451 return Lower256IntVSETCC(Op, DAG);
19453 // The result is boolean, but operands are int/float
19454 if (VT.getVectorElementType() == MVT::i1) {
19455 // In the AVX-512 architecture setcc returns a mask with i1 elements,
19456 // but there is no compare instruction for i8 and i16 elements in KNL.
19457 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
19458 "Unexpected operand type");
19459 return LowerIntVSETCC_AVX512(Op, DAG);
19462 // Lower using XOP integer comparisons.
19463 if (VT.is128BitVector() && Subtarget.hasXOP()) {
19464 // Translate compare code to XOP PCOM compare mode.
19465 unsigned CmpMode = 0;
19467 default: llvm_unreachable("Unexpected SETCC condition");
19469 case ISD::SETLT: CmpMode = 0x00; break;
19471 case ISD::SETLE: CmpMode = 0x01; break;
19473 case ISD::SETGT: CmpMode = 0x02; break;
19475 case ISD::SETGE: CmpMode = 0x03; break;
19476 case ISD::SETEQ: CmpMode = 0x04; break;
19477 case ISD::SETNE: CmpMode = 0x05; break;
19480 // Are we comparing unsigned or signed integers?
19482 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
19484 return DAG.getNode(Opc, dl, VT, Op0, Op1,
19485 DAG.getConstant(CmpMode, dl, MVT::i8));
19488 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
19489 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
19490 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
19491 SDValue BC0 = peekThroughBitcasts(Op0);
19492 if (BC0.getOpcode() == ISD::AND) {
19494 SmallVector<APInt, 64> EltBits;
19495 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
19496 VT.getScalarSizeInBits(), UndefElts,
19497 EltBits, false, false)) {
19498 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
19500 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
19506 // If this is a SETNE against the signed minimum value, change it to SETGT.
19507 // If this is a SETNE against the signed maximum value, change it to SETLT,
19508 // which will be swapped to SETGT.
19509 // Otherwise we use PCMPEQ+invert.
19511 if (Cond == ISD::SETNE &&
19512 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
19513 if (ConstValue.isMinSignedValue())
19515 else if (ConstValue.isMaxSignedValue())
19519 // If both operands are known non-negative, then an unsigned compare is the
19520 // same as a signed compare and there's no need to flip signbits.
19521 // TODO: We could check for more general simplifications here since we're
19522 // computing known bits.
19523 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
19524 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
19526 // Special case: Use min/max operations for unsigned compares.
19527 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19528 if (ISD::isUnsignedIntSetCC(Cond) &&
19529 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
19530 TLI.isOperationLegal(ISD::UMIN, VT)) {
19531 // If we have a constant operand, increment/decrement it and change the
19532 // condition to avoid an invert.
19533 // TODO: This could be extended to handle a non-splat constant by checking
19534 // that each element of the constant is not the max/null value.
19536 if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) {
19537 // X > C --> X >= (C+1) --> X == umax(X, C+1)
19538 Op1 = DAG.getConstant(C + 1, dl, VT);
19539 Cond = ISD::SETUGE;
19541 if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) {
19542 // X < C --> X <= (C-1) --> X == umin(X, C-1)
19543 Op1 = DAG.getConstant(C - 1, dl, VT);
19544 Cond = ISD::SETULE;
19546 bool Invert = false;
19549 default: llvm_unreachable("Unexpected condition code");
19550 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
19551 case ISD::SETULE: Opc = ISD::UMIN; break;
19552 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
19553 case ISD::SETUGE: Opc = ISD::UMAX; break;
19556 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
19557 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
19559 // If the logical-not of the result is required, perform that now.
19561 Result = DAG.getNOT(dl, Result, VT);
19566 // Try to use SUBUS and PCMPEQ.
19567 if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
19570 // We are handling one of the integer comparisons here. Since SSE only has
19571 // GT and EQ comparisons for integer, swapping operands and multiple
19572 // operations may be required for some comparisons.
19573 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
19575 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
19576 Cond == ISD::SETGE || Cond == ISD::SETUGE;
19577 bool Invert = Cond == ISD::SETNE ||
19578 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
19581 std::swap(Op0, Op1);
19583 // Check that the operation in question is available (most are plain SSE2,
19584 // but PCMPGTQ and PCMPEQQ have different requirements).
19585 if (VT == MVT::v2i64) {
19586 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
19587 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
19589 // Since SSE has no unsigned integer comparisons, we need to flip the sign
19590 // bits of the inputs before performing those operations. The lower
19591 // compare is always unsigned.
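// Concretely: for an unsigned i64 compare (FlipSigns) we flip the sign bit
// of both 32-bit halves; for a signed one we flip only the low halves, so
// the high-part PCMPGT below acts as a signed compare while the low-part
// compare is effectively unsigned.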
19594 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
19596 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
19598 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
19599 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
19601 // Cast everything to the right type.
19602 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
19603 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
19605 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
19606 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
19607 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
19609 // Create masks for only the low parts/high parts of the 64-bit integers.
19610 static const int MaskHi[] = { 1, 1, 3, 3 };
19611 static const int MaskLo[] = { 0, 0, 2, 2 };
19612 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
19613 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
19614 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
19616 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
19617 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
19620 Result = DAG.getNOT(dl, Result, MVT::v4i32);
19622 return DAG.getBitcast(VT, Result);
19625 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
19626 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
19627 // pcmpeqd + pshufd + pand.
19628 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
19630 // First cast everything to the right type.
19631 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
19632 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
19635 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
19637 // Make sure the lower and upper halves are both all-ones.
19638 static const int Mask[] = { 1, 0, 3, 2 };
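// The {1, 0, 3, 2} shuffle swaps the two i32 halves of every i64 element,
// so the AND below is all-ones in an element only if both of its halves
// compared equal.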
19639 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
19640 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
19643 Result = DAG.getNOT(dl, Result, MVT::v4i32);
19645 return DAG.getBitcast(VT, Result);
19649 // Since SSE has no unsigned integer comparisons, we need to flip the sign
19650 // bits of the inputs before performing those operations.
19652 MVT EltVT = VT.getVectorElementType();
19653 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
19655 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
19656 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
19659 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
19661 // If the logical-not of the result is required, perform that now.
19663 Result = DAG.getNOT(dl, Result, VT);
19668 // Try to select this as a KORTEST+SETCC if possible.
19669 static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
19670 const SDLoc &dl, SelectionDAG &DAG,
19671 const X86Subtarget &Subtarget,
19673 // Only support equality comparisons.
19674 if (CC != ISD::SETEQ && CC != ISD::SETNE)
19677 // Must be a bitcast from vXi1.
19678 if (Op0.getOpcode() != ISD::BITCAST)
19681 Op0 = Op0.getOperand(0);
19682 MVT VT = Op0.getSimpleValueType();
19683 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
19684 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
19685 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
19688 X86::CondCode X86Cond;
19689 if (isNullConstant(Op1)) {
19690 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
19691 } else if (isAllOnesConstant(Op1)) {
19692 // C flag is set for all ones.
19693 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
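// (KORTEST ORs the two mask operands and sets ZF when the result is zero
// and CF when it is all ones, which is why the all-ones case tests the
// carry flag here.)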
19697 // If the input is an OR, we can combine its operands into the KORTEST.
19700 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
19701 LHS = Op0.getOperand(0);
19702 RHS = Op0.getOperand(1);
19705 X86CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19706 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19709 /// Emit flags for the given setcc condition and operands. Also returns the
19710 /// corresponding X86 condition code constant in X86CC.
19711 SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
19712 ISD::CondCode CC, const SDLoc &dl,
19714 SDValue &X86CC) const {
19715 // Optimize to BT if possible.
19716 // Lower (X & (1 << N)) == 0 to BT(X, N).
19717 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
19718 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
19719 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
19720 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
19721 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
19725 // Try to use PTEST for a tree of ORs equality-compared with 0.
19726 // TODO: We could do AND tree with all 1s as well by using the C flag.
19727 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
19728 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
19729 if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
19733 // Try to lower using KORTEST.
19734 if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
19737 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
19739 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
19740 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
19741 // If the input is a setcc, then reuse the input setcc or use a new one with
19742 // the inverted condition.
19743 if (Op0.getOpcode() == X86ISD::SETCC) {
19744 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
19746 X86CC = Op0.getOperand(0);
19748 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
19749 CCode = X86::GetOppositeBranchCondition(CCode);
19750 X86CC = DAG.getConstant(CCode, dl, MVT::i8);
19753 return Op0.getOperand(1);
19757 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
19758 X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
19759 if (CondCode == X86::COND_INVALID)
19762 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG);
19763 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
19764 X86CC = DAG.getConstant(CondCode, dl, MVT::i8);
19768 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
19770 MVT VT = Op.getSimpleValueType();
19772 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
19774 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
19775 SDValue Op0 = Op.getOperand(0);
19776 SDValue Op1 = Op.getOperand(1);
19778 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19781 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
19785 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
19788 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
19789 SDValue LHS = Op.getOperand(0);
19790 SDValue RHS = Op.getOperand(1);
19791 SDValue Carry = Op.getOperand(2);
19792 SDValue Cond = Op.getOperand(3);
19795 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
19796 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
19798 // Recreate the carry if needed.
19799 EVT CarryVT = Carry.getValueType();
19800 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
19801 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
19802 Carry, DAG.getConstant(NegOne, DL, CarryVT));
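// Adding all-ones to the incoming boolean carry sets CF exactly when that
// carry was nonzero (1 + 0xFF...FF wraps), so the SBB below picks the carry
// back up from EFLAGS.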
19804 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19805 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
19806 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
19809 // This function returns three things: the arithmetic computation itself
19810 // (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
19811 // flag and the condition code define the case in which the arithmetic
19812 // computation overflows.
19813 static std::pair<SDValue, SDValue>
19814 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
19815 assert(Op.getResNo() == 0 && "Unexpected result number!");
19816 SDValue Value, Overflow;
19817 SDValue LHS = Op.getOperand(0);
19818 SDValue RHS = Op.getOperand(1);
19819 unsigned BaseOp = 0;
19821 switch (Op.getOpcode()) {
19822 default: llvm_unreachable("Unknown ovf instruction!");
19824 BaseOp = X86ISD::ADD;
19825 Cond = X86::COND_O;
19828 BaseOp = X86ISD::ADD;
19829 Cond = X86::COND_B;
19832 BaseOp = X86ISD::SUB;
19833 Cond = X86::COND_O;
19836 BaseOp = X86ISD::SUB;
19837 Cond = X86::COND_B;
19840 BaseOp = X86ISD::SMUL;
19841 Cond = X86::COND_O;
19844 BaseOp = X86ISD::UMUL;
19845 Cond = X86::COND_O;
19850 // Also sets EFLAGS.
19851 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19852 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19853 Overflow = Value.getValue(1);
19856 return std::make_pair(Value, Overflow);
19859 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19860 // Lower the "add/sub/mul with overflow" instruction into a regular arithmetic instruction plus
19861 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19862 // looks for this combo and may remove the "setcc" instruction if the "setcc"
19863 // has only one use.
19865 X86::CondCode Cond;
19866 SDValue Value, Overflow;
19867 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
19869 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
19870 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
19874 /// Return true if the opcode is an X86 logical comparison.
19874 static bool isX86LogicalCmp(SDValue Op) {
19875 unsigned Opc = Op.getOpcode();
19876 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
19877 Opc == X86ISD::SAHF)
19879 if (Op.getResNo() == 1 &&
19880 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
19881 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
19882 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
19888 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
19889 if (V.getOpcode() != ISD::TRUNCATE)
19892 SDValue VOp0 = V.getOperand(0);
19893 unsigned InBits = VOp0.getValueSizeInBits();
19894 unsigned Bits = V.getValueSizeInBits();
19895 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
19898 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
19899 bool AddTest = true;
19900 SDValue Cond = Op.getOperand(0);
19901 SDValue Op1 = Op.getOperand(1);
19902 SDValue Op2 = Op.getOperand(2);
19904 MVT VT = Op1.getSimpleValueType();
19907 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
19908 // are available or VBLENDV if AVX is available.
19909 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
19910 if (Cond.getOpcode() == ISD::SETCC &&
19911 ((Subtarget.hasSSE2() && VT == MVT::f64) ||
19912 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
19913 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
19914 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
19915 unsigned SSECC = translateX86FSETCC(
19916 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
19918 if (Subtarget.hasAVX512()) {
19919 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
19920 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
19921 assert(!VT.isVector() && "Not a scalar type?");
19922 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
19925 if (SSECC < 8 || Subtarget.hasAVX()) {
19926 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
19927 DAG.getConstant(SSECC, DL, MVT::i8));
19929 // If we have AVX, we can use a variable vector select (VBLENDV) instead
19930 // of 3 logic instructions for size savings and potentially speed.
19931 // Unfortunately, there is no scalar form of VBLENDV.
19933 // If either operand is a +0.0 constant, don't try this. We can expect to
19934 // optimize away at least one of the logic instructions later in that
19935 // case, so that sequence would be faster than a variable blend.
19937 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
19938 // uses XMM0 as the selection register. That may need just as many
19939 // instructions as the AND/ANDN/OR sequence due to register moves, so don't bother.
19941 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
19942 !isNullFPConstant(Op2)) {
19943 // Convert to vectors, do a VSELECT, and convert back to scalar.
19944 // All of the conversions should be optimized away.
19945 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
19946 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
19947 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
19948 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
19950 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
19951 VCmp = DAG.getBitcast(VCmpVT, VCmp);
19953 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
19955 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19956 VSel, DAG.getIntPtrConstant(0, DL));
19958 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
19959 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
19960 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
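// The three ops above compute Result = (Cmp & Op1) | (~Cmp & Op2): the
// all-ones/all-zeros compare mask selects Op1 where the condition held and
// Op2 elsewhere.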
19964 // AVX512 fallback is to lower selects of scalar floats to masked moves.
19965 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
19966 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
19967 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
19970 // For v64i1 without 64-bit support we need to split and rejoin.
19971 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
19972 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
19973 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
19974 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
19975 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
19976 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
19977 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
19978 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
19979 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
19982 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
19984 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
19985 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
19986 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
19987 Op1Scalar = Op1.getOperand(0);
19989 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
19990 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
19991 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
19992 Op2Scalar = Op2.getOperand(0);
19993 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
19994 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
19995 Op1Scalar, Op2Scalar);
19996 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
19997 return DAG.getBitcast(VT, newSelect);
19998 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
19999 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
20000 DAG.getIntPtrConstant(0, DL));
20004 if (Cond.getOpcode() == ISD::SETCC) {
20005 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
20007 // If the condition was updated, it's possible that the operands of the
20008 // select were also updated (for example, EmitTest has a RAUW). Refresh
20009 // the local references to the select operands in case they got stale.
20010 Op1 = Op.getOperand(1);
20011 Op2 = Op.getOperand(2);
20015 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
20016 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
20017 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
20018 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
20019 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
20020 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
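// How the lowering below realizes these patterns (informal sketch): comparing
// x against 1 sets the carry flag exactly when x == 0 (unsigned x < 1), so an
// SBB of a register with itself yields -1 for x == 0 and 0 otherwise; that
// 0/-1 mask (or its complement) is then OR'd with y, avoiding any branch.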
20021 if (Cond.getOpcode() == X86ISD::SETCC &&
20022 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
20023 isNullConstant(Cond.getOperand(1).getOperand(1))) {
20024 SDValue Cmp = Cond.getOperand(1);
20025 unsigned CondCode =
20026 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
20028 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
20029 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
20030 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
20031 SDValue CmpOp0 = Cmp.getOperand(0);
20033 // Apply further optimizations for special cases
20034 // (select (x != 0), -1, 0) -> neg & sbb
20035 // (select (x == 0), 0, -1) -> neg & sbb
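// Illustrative machine-level sketch for (select (x != 0), -1, 0) on i32: the
// CMP(0, x) built below is typically selected as a NEG, giving roughly
//   negl %eax            ; sets CF exactly when x != 0
//   sbbl %eax, %eax      ; 0 - 0 - CF = -1 if x != 0, else 0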
20036 if (isNullConstant(Y) &&
20037 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
20038 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
20039 SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
20040 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20041 Zero = DAG.getConstant(0, DL, Op.getValueType());
20042 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
20045 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
20046 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
20047 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
20049 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20050 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
20051 SDValue Res = // Res = 0 or -1.
20052 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
20054 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
20055 Res = DAG.getNOT(DL, Res, Res.getValueType());
20057 if (!isNullConstant(Op2))
20058 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
20060 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
20061 Cmp.getOperand(0).getOpcode() == ISD::AND &&
20062 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
20063 SDValue CmpOp0 = Cmp.getOperand(0);
20064 SDValue Src1, Src2;
20065 // true if Op2 is an XOR or OR operator and one of its operands is equal to Op1:
20067 // ( a , a op b) || ( b , a op b)
20068 auto isOrXorPattern = [&]() {
20069 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
20070 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
20072 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
20079 if (isOrXorPattern()) {
20081 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
20082 // we need a mask of all zeros or all ones with the same size as the other operands.
20084 if (CmpSz > VT.getSizeInBits())
20085 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
20086 else if (CmpSz < VT.getSizeInBits())
20087 Neg = DAG.getNode(ISD::AND, DL, VT,
20088 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
20089 DAG.getConstant(1, DL, VT));
20092 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
20093 Neg); // -(and (x, 0x1))
20094 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
20095 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
20100 // Look past (and (setcc_carry (cmp ...)), 1).
20101 if (Cond.getOpcode() == ISD::AND &&
20102 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
20103 isOneConstant(Cond.getOperand(1)))
20104 Cond = Cond.getOperand(0);
20106 // If condition flag is set by a X86ISD::CMP, then use it as the condition
20107 // setting operand in place of the X86ISD::SETCC.
20108 unsigned CondOpcode = Cond.getOpcode();
20109 if (CondOpcode == X86ISD::SETCC ||
20110 CondOpcode == X86ISD::SETCC_CARRY) {
20111 CC = Cond.getOperand(0);
20113 SDValue Cmp = Cond.getOperand(1);
20114 unsigned Opc = Cmp.getOpcode();
20115 MVT VT = Op.getSimpleValueType();
20117 bool IllegalFPCMov = false;
20118 if (VT.isFloatingPoint() && !VT.isVector() &&
20119 !isScalarFPTypeInSSEReg(VT)) // FPStack?
20120 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
20122 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
20123 Opc == X86ISD::BT) { // FIXME
20127 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
20128 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
20129 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
20131 X86::CondCode X86Cond;
20132 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
20134 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
20139 // Look past the truncate if the high bits are known zero.
20140 if (isTruncWithZeroHighBitsInput(Cond, DAG))
20141 Cond = Cond.getOperand(0);
20143 // We know the result of AND is compared against zero. Try to match it to a BT.
20145 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
20147 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
20156 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
20157 Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
20158 X86::COND_NE, DL, DAG);
20161 // a < b ? -1 : 0 -> RES = ~setcc_carry
20162 // a < b ? 0 : -1 -> RES = setcc_carry
20163 // a >= b ? -1 : 0 -> RES = setcc_carry
20164 // a >= b ? 0 : -1 -> RES = ~setcc_carry
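// Background (informal): for an unsigned compare the SUB feeding the flags
// leaves CF set exactly when a < b, and SETCC_CARRY (effectively an SBB of a
// register with itself) turns that carry into an all-zeros/all-ones value;
// the getNOT below covers the two variants with inverted polarity.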
20165 if (Cond.getOpcode() == X86ISD::SUB) {
20166 Cond = ConvertCmpIfNecessary(Cond, DAG);
20167 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
20169 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
20170 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
20171 (isNullConstant(Op1) || isNullConstant(Op2))) {
20172 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
20173 DAG.getConstant(X86::COND_B, DL, MVT::i8),
20175 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
20176 return DAG.getNOT(DL, Res, Res.getValueType());
20181 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
20182 // widen the cmov and push the truncate through. This avoids introducing a new
20183 // branch during isel and doesn't add any extensions.
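// For example (sketch): for
//   (select cc, (trunc i32 %a to i8), (trunc i32 %b to i8))
// the CMOV is performed at i32 on %a/%b and only the result is truncated to
// i8, so isel can use a real 32-bit CMOV instead of synthesizing a branch.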
20184 if (Op.getValueType() == MVT::i8 &&
20185 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
20186 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
20187 if (T1.getValueType() == T2.getValueType() &&
20188 // Blacklist CopyFromReg to avoid partial register stalls.
20189 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
20190 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
20192 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
20196 // Promote i16 cmovs if it won't prevent folding a load.
20197 if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
20198 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
20199 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
20200 SDValue Ops[] = { Op2, Op1, CC, Cond };
20201 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
20202 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
20205 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
20206 // the condition is true.
20207 SDValue Ops[] = { Op2, Op1, CC, Cond };
20208 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
20211 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
20212 const X86Subtarget &Subtarget,
20213 SelectionDAG &DAG) {
20214 MVT VT = Op->getSimpleValueType(0);
20215 SDValue In = Op->getOperand(0);
20216 MVT InVT = In.getSimpleValueType();
20217 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20218 MVT VTElt = VT.getVectorElementType();
20221 unsigned NumElts = VT.getVectorNumElements();
20223 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
20225 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
20226 // If v16i32 is to be avoided, we'll need to split and concatenate.
20227 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20228 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
20230 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20233 // Widen to 512-bits if VLX is not supported.
20234 MVT WideVT = ExtVT;
20235 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20236 NumElts *= 512 / ExtVT.getSizeInBits();
20237 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20238 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
20239 In, DAG.getIntPtrConstant(0, dl));
20240 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
20244 MVT WideEltVT = WideVT.getVectorElementType();
20245 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
20246 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
20247 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
20249 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
20250 SDValue Zero = DAG.getConstant(0, dl, WideVT);
20251 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
20254 // Truncate if we had to extend i16/i8 above.
20256 WideVT = MVT::getVectorVT(VTElt, NumElts);
20257 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
20260 // Extract back to 128/256-bit if we widened.
20262 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
20263 DAG.getIntPtrConstant(0, dl));
20268 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20269 SelectionDAG &DAG) {
20270 SDValue In = Op->getOperand(0);
20271 MVT InVT = In.getSimpleValueType();
20273 if (InVT.getVectorElementType() == MVT::i1)
20274 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
20276 assert(Subtarget.hasAVX() && "Expected AVX support");
20277 return LowerAVXExtend(Op, DAG, Subtarget);
20280 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
20281 // For sign extend this needs to handle all vector sizes and SSE4.1 and
20282 // non-SSE4.1 targets. For zero extend this should only handle inputs of
20283 // MVT::v64i8 when BWI is not supported, but AVX512 is.
20284 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
20285 const X86Subtarget &Subtarget,
20286 SelectionDAG &DAG) {
20287 SDValue In = Op->getOperand(0);
20288 MVT VT = Op->getSimpleValueType(0);
20289 MVT InVT = In.getSimpleValueType();
20291 MVT SVT = VT.getVectorElementType();
20292 MVT InSVT = InVT.getVectorElementType();
20293 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
20295 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
20297 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
20299 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
20300 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
20301 !(VT.is512BitVector() && Subtarget.hasAVX512()))
20305 unsigned Opc = Op.getOpcode();
20306 unsigned NumElts = VT.getVectorNumElements();
20308 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
20309 // For 512-bit vectors, we need 128-bits or 256-bits.
20310 if (InVT.getSizeInBits() > 128) {
20311 // The input needs to have at least the same number of elements as the output,
20312 // and be at least 128 bits wide.
20313 int InSize = InSVT.getSizeInBits() * NumElts;
20314 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
20315 InVT = In.getSimpleValueType();
20318 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
20319 // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
20320 // need to be handled here for 256/512-bit results.
20321 if (Subtarget.hasInt256()) {
20322 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
20324 if (InVT.getVectorNumElements() != NumElts)
20325 return DAG.getNode(Op.getOpcode(), dl, VT, In);
20327 // FIXME: Apparently we create inreg operations that could be regular extends.
20330 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
20331 : ISD::ZERO_EXTEND;
20332 return DAG.getNode(ExtOpc, dl, VT, In);
20335 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
20336 if (Subtarget.hasAVX()) {
20337 assert(VT.is256BitVector() && "256-bit vector expected");
20338 int HalfNumElts = NumElts / 2;
20339 MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
20341 unsigned NumSrcElts = InVT.getVectorNumElements();
20342 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
20343 for (int i = 0; i != HalfNumElts; ++i)
20344 HiMask[i] = HalfNumElts + i;
20346 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
20347 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
20348 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
20349 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
20352 // We should only get here for sign extend.
20353 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
20354 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
20356 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
20358 SDValue SignExt = Curr;
20360 // As SRAI is only available on i16/i32 types, we expand only up to i32
20361 // and handle i64 separately.
20362 if (InVT != MVT::v4i32) {
20363 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
20365 unsigned DestWidth = DestVT.getScalarSizeInBits();
20366 unsigned Scale = DestWidth / InSVT.getSizeInBits();
20368 unsigned InNumElts = InVT.getVectorNumElements();
20369 unsigned DestElts = DestVT.getVectorNumElements();
20371 // Build a shuffle mask that takes each input element and places it in the
20372 // MSBs of the new element size.
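// Worked example (illustrative): for a v16i8 input sign-extended in-reg to
// v4i32, Scale = 4 and DestElts = 4, so the mask built below is
//   { -1, -1, -1, 0,  -1, -1, -1, 1,  -1, -1, -1, 2,  -1, -1, -1, 3 }
// which, on little-endian x86, places source byte i in the most-significant
// byte of i32 lane i, ready for the VSRAI by 24 that follows.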
20373 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
20374 for (unsigned i = 0; i != DestElts; ++i)
20375 Mask[i * Scale + (Scale - 1)] = i;
20377 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
20378 Curr = DAG.getBitcast(DestVT, Curr);
20380 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
20381 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
20382 DAG.getConstant(SignExtShift, dl, MVT::i8));
20385 if (VT == MVT::v2i64) {
20386 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
20387 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
20388 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
20389 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
20390 SignExt = DAG.getBitcast(VT, SignExt);
20396 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20397 SelectionDAG &DAG) {
20398 MVT VT = Op->getSimpleValueType(0);
20399 SDValue In = Op->getOperand(0);
20400 MVT InVT = In.getSimpleValueType();
20403 if (InVT.getVectorElementType() == MVT::i1)
20404 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
20406 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20407 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20408 "Expected same number of elements");
20409 assert((VT.getVectorElementType() == MVT::i16 ||
20410 VT.getVectorElementType() == MVT::i32 ||
20411 VT.getVectorElementType() == MVT::i64) &&
20412 "Unexpected element type");
20413 assert((InVT.getVectorElementType() == MVT::i8 ||
20414 InVT.getVectorElementType() == MVT::i16 ||
20415 InVT.getVectorElementType() == MVT::i32) &&
20416 "Unexpected element type");
20418 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
20419 if (InVT == MVT::v8i8) {
20420 if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
20423 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
20424 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
20425 return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
20428 if (Subtarget.hasInt256())
20431 // Optimize vectors in AVX mode:
20432 // Sign extend v8i16 to v8i32 and v4i32 to v2i64.
20435 // Divide the input vector into two parts;
20436 // for v4i32 the high shuffle mask will be {2, 3, -1, -1},
20437 // use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
20438 // then concat the vectors back to the original VT.
20440 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
20441 VT.getVectorNumElements() / 2);
20443 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
20445 unsigned NumElems = InVT.getVectorNumElements();
20446 SmallVector<int,8> ShufMask(NumElems, -1);
20447 for (unsigned i = 0; i != NumElems/2; ++i)
20448 ShufMask[i] = i + NumElems/2;
20450 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
20451 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
20453 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20456 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
20457 SelectionDAG &DAG) {
20458 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
20460 SDValue StoredVal = St->getValue();
20462 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
20463 if (StoredVal.getValueType().isVector() &&
20464 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
20465 assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
20467 assert(!St->isTruncatingStore() && "Expected non-truncating store");
20468 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
20469 "Expected AVX512F without AVX512DQI");
20471 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
20472 DAG.getUNDEF(MVT::v16i1), StoredVal,
20473 DAG.getIntPtrConstant(0, dl));
20474 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
20475 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
20477 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
20478 St->getPointerInfo(), St->getAlignment(),
20479 St->getMemOperand()->getFlags());
20482 if (St->isTruncatingStore())
20485 MVT StoreVT = StoredVal.getSimpleValueType();
20486 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
20488 if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
20489 TargetLowering::TypeWidenVector)
20492 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element, and store it.
20494 MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
20495 StoreVT.getVectorNumElements() * 2);
20496 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
20497 DAG.getUNDEF(StoreVT));
20498 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
20499 MVT CastVT = MVT::getVectorVT(StVT, 2);
20500 StoredVal = DAG.getBitcast(CastVT, StoredVal);
20501 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
20502 DAG.getIntPtrConstant(0, dl));
20504 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
20505 St->getPointerInfo(), St->getAlignment(),
20506 St->getMemOperand()->getFlags());
20509 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
20510 // may emit an illegal shuffle but the expansion is still better than scalar
20511 // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
20512 // we'll emit a shuffle and an arithmetic shift.
20513 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
20514 // TODO: It is possible to support ZExt by zeroing the undef values during
20515 // the shuffle phase or after the shuffle.
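// Illustrative example (not a literal trace of the code below): a sextload of
// v4i8 extended to v4i32 can become one 32-bit scalar load, a byte shuffle
// that places each input byte in the high byte of its i32 lane, and an
// arithmetic shift right by 24, instead of four scalar loads plus inserts.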
20516 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
20517 SelectionDAG &DAG) {
20518 MVT RegVT = Op.getSimpleValueType();
20519 assert(RegVT.isVector() && "We only custom lower vector loads.");
20520 assert(RegVT.isInteger() &&
20521 "We only custom lower integer vector loads.");
20523 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
20525 EVT MemVT = Ld->getMemoryVT();
20527 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
20528 if (RegVT.getVectorElementType() == MVT::i1) {
20529 assert(EVT(RegVT) == MemVT && "Expected non-extending load");
20530 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
20531 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
20532 "Expected AVX512F without AVX512DQI");
20534 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
20535 Ld->getPointerInfo(), Ld->getAlignment(),
20536 Ld->getMemOperand()->getFlags());
20538 // Replace chain users with the new chain.
20539 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
20541 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
20542 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
20543 DAG.getBitcast(MVT::v16i1, Val),
20544 DAG.getIntPtrConstant(0, dl));
20545 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
20548 // Nothing useful we can do without SSE2 shuffles.
20549 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
20551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20552 unsigned RegSz = RegVT.getSizeInBits();
20554 ISD::LoadExtType Ext = Ld->getExtensionType();
20556 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
20557 && "Only anyext and sext are currently implemented.");
20558 assert(MemVT != RegVT && "Cannot extend to the same type");
20559 assert(MemVT.isVector() && "Must load a vector from memory");
20561 unsigned NumElems = RegVT.getVectorNumElements();
20562 unsigned MemSz = MemVT.getSizeInBits();
20563 assert(RegSz > MemSz && "Register size must be greater than the mem size");
20565 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
20566 // The only way in which we have a legal 256-bit vector result but not the
20567 // integer 256-bit operations needed to directly lower a sextload is if we
20568 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
20569 // a 128-bit vector and a normal sign_extend to 256-bits that should get
20570 // correctly legalized. We do this late to allow the canonical form of
20571 // sextload to persist throughout the rest of the DAG combiner -- it wants
20572 // to fold together any extensions it can, and so will fuse a sign_extend
20573 // of an sextload into a sextload targeting a wider value.
20575 if (MemSz == 128) {
20576 // Just switch this to a normal load.
20577 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
20578 "it must be a legal 128-bit vector "
20580 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
20581 Ld->getPointerInfo(), Ld->getAlignment(),
20582 Ld->getMemOperand()->getFlags());
20584 assert(MemSz < 128 &&
20585 "Can't extend a type wider than 128 bits to a 256 bit vector!");
20586 // Do an sext load to a 128-bit vector type. We want to use the same
20587 // number of elements, but elements half as wide. This will end up being
20588 // recursively lowered by this routine, but will succeed as we definitely
20589 // have all the necessary features if we're using AVX1.
20591 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
20592 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
20594 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
20595 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
20596 Ld->getMemOperand()->getFlags());
20599 // Replace chain users with the new chain.
20600 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
20602 // Finally, do a normal sign-extend to the desired register.
20603 SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT);
20604 return DAG.getMergeValues({SExt, Load.getValue(1)}, dl);
20607 // All sizes must be a power of two.
20608 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
20609 "Non-power-of-two elements are not custom lowered!");
20611 // Attempt to load the original value using scalar loads.
20612 // Find the largest scalar type that divides the total loaded size.
20613 MVT SclrLoadTy = MVT::i8;
20614 for (MVT Tp : MVT::integer_valuetypes()) {
20615 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
20620 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
20621 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
20623 SclrLoadTy = MVT::f64;
20625 // Calculate the number of scalar loads that we need to perform
20626 // in order to load our vector from memory.
20627 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
20629 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
20630 "Can only lower sext loads with a single scalar load!");
20632 unsigned loadRegSize = RegSz;
20633 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
20636 // If we don't have BWI we won't be able to create the shuffle needed for the v8i8->v8i64 case.
20638 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
20639 MemVT == MVT::v8i8)
20642 // Represent our vector as a sequence of elements which are the
20643 // largest scalars that we can load.
20644 EVT LoadUnitVecVT = EVT::getVectorVT(
20645 *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
20647 // Represent the data using the same element type that is stored in
20648 // memory. In practice, we "widen" MemVT.
20650 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
20651 loadRegSize / MemVT.getScalarSizeInBits());
20653 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
20654 "Invalid vector type");
20656 // We can't shuffle using an illegal type.
20657 assert(TLI.isTypeLegal(WideVecVT) &&
20658 "We only lower types that form legal widened vector types");
20660 SmallVector<SDValue, 8> Chains;
20661 SDValue Ptr = Ld->getBasePtr();
20662 unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8;
20663 SDValue Increment = DAG.getConstant(OffsetInc, dl,
20664 TLI.getPointerTy(DAG.getDataLayout()));
20665 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
20667 unsigned Offset = 0;
20668 for (unsigned i = 0; i < NumLoads; ++i) {
20669 unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset);
20671 // Perform a single load.
20672 SDValue ScalarLoad =
20673 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
20674 Ld->getPointerInfo().getWithOffset(Offset),
20675 NewAlign, Ld->getMemOperand()->getFlags());
20676 Chains.push_back(ScalarLoad.getValue(1));
20677 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
20678 // another round of DAGCombining.
20680 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
20682 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
20683 ScalarLoad, DAG.getIntPtrConstant(i, dl));
20685 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
20686 Offset += OffsetInc;
20689 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
20691 // Bitcast the loaded value to a vector of the original element type, in
20692 // the size of the target vector type.
20693 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
20694 unsigned SizeRatio = RegSz / MemSz;
20696 if (Ext == ISD::SEXTLOAD) {
20697 SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG);
20698 return DAG.getMergeValues({Sext, TF}, dl);
20701 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
20702 MemVT == MVT::v8i8) {
20703 SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG);
20704 return DAG.getMergeValues({Sext, TF}, dl);
20707 // Redistribute the loaded elements into the different locations.
20708 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
20709 for (unsigned i = 0; i != NumElems; ++i)
20710 ShuffleVec[i * SizeRatio] = i;
20712 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
20713 DAG.getUNDEF(WideVecVT), ShuffleVec);
20715 // Bitcast to the requested type.
20716 Shuff = DAG.getBitcast(RegVT, Shuff);
20717 return DAG.getMergeValues({Shuff, TF}, dl);
20720 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
20721 /// each of which has no other use apart from the AND / OR.
20722 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
20723 Opc = Op.getOpcode();
20724 if (Opc != ISD::OR && Opc != ISD::AND)
20726 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
20727 Op.getOperand(0).hasOneUse() &&
20728 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
20729 Op.getOperand(1).hasOneUse());
20732 /// Return true if node is an ISD::XOR of an X86ISD::SETCC and 1, and that the
20733 /// SETCC node has a single use.
20734 static bool isXor1OfSetCC(SDValue Op) {
20735 if (Op.getOpcode() != ISD::XOR)
20737 if (isOneConstant(Op.getOperand(1)))
20738 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
20739 Op.getOperand(0).hasOneUse();
20743 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
20744 bool addTest = true;
20745 SDValue Chain = Op.getOperand(0);
20746 SDValue Cond = Op.getOperand(1);
20747 SDValue Dest = Op.getOperand(2);
20750 bool Inverted = false;
20752 if (Cond.getOpcode() == ISD::SETCC) {
20753 // Check for setcc([su]{add,sub,mul}o == 0).
20754 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
20755 isNullConstant(Cond.getOperand(1)) &&
20756 Cond.getOperand(0).getResNo() == 1 &&
20757 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
20758 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
20759 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
20760 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
20761 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
20762 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
20764 Cond = Cond.getOperand(0);
20766 if (SDValue NewCond = LowerSETCC(Cond, DAG))
20771 // FIXME: LowerXALUO doesn't handle these!!
20772 else if (Cond.getOpcode() == X86ISD::ADD ||
20773 Cond.getOpcode() == X86ISD::SUB ||
20774 Cond.getOpcode() == X86ISD::SMUL ||
20775 Cond.getOpcode() == X86ISD::UMUL)
20776 Cond = LowerXALUO(Cond, DAG);
20779 // Look past (and (setcc_carry (cmp ...)), 1).
20780 if (Cond.getOpcode() == ISD::AND &&
20781 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
20782 isOneConstant(Cond.getOperand(1)))
20783 Cond = Cond.getOperand(0);
20785 // If condition flag is set by a X86ISD::CMP, then use it as the condition
20786 // setting operand in place of the X86ISD::SETCC.
20787 unsigned CondOpcode = Cond.getOpcode();
20788 if (CondOpcode == X86ISD::SETCC ||
20789 CondOpcode == X86ISD::SETCC_CARRY) {
20790 CC = Cond.getOperand(0);
20792 SDValue Cmp = Cond.getOperand(1);
20793 unsigned Opc = Cmp.getOpcode();
20794 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
20795 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
20799 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
20803 // These can only come from an arithmetic instruction with overflow,
20804 // e.g. SADDO, UADDO.
20805 Cond = Cond.getOperand(1);
20811 CondOpcode = Cond.getOpcode();
20812 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
20813 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
20814 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
20816 X86::CondCode X86Cond;
20817 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
20820 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
20822 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
20826 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
20827 SDValue Cmp = Cond.getOperand(0).getOperand(1);
20828 if (CondOpc == ISD::OR) {
20829 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
20830 // two branches instead of an explicit OR instruction with a separate test.
20832 if (Cmp == Cond.getOperand(1).getOperand(1) &&
20833 isX86LogicalCmp(Cmp)) {
20834 CC = Cond.getOperand(0).getOperand(0);
20835 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
20836 Chain, Dest, CC, Cmp);
20837 CC = Cond.getOperand(1).getOperand(0);
20841 } else { // ISD::AND
20842 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
20843 // two branches instead of an explicit AND instruction with a
20844 // separate test. However, we only do this if this block doesn't
20845 // have a fall-through edge, because this requires an explicit
20846 // jmp when the condition is false.
20847 if (Cmp == Cond.getOperand(1).getOperand(1) &&
20848 isX86LogicalCmp(Cmp) &&
20849 Op.getNode()->hasOneUse()) {
20850 X86::CondCode CCode =
20851 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
20852 CCode = X86::GetOppositeBranchCondition(CCode);
20853 CC = DAG.getConstant(CCode, dl, MVT::i8);
20854 SDNode *User = *Op.getNode()->use_begin();
20855 // Look for an unconditional branch following this conditional branch.
20856 // We need this because we need to reverse the successors in order
20857 // to implement FCMP_OEQ.
20858 if (User->getOpcode() == ISD::BR) {
20859 SDValue FalseBB = User->getOperand(1);
20861 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
20862 assert(NewBR == User);
20866 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
20867 Chain, Dest, CC, Cmp);
20868 X86::CondCode CCode =
20869 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
20870 CCode = X86::GetOppositeBranchCondition(CCode);
20871 CC = DAG.getConstant(CCode, dl, MVT::i8);
20877 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
20878 // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
20879 // It should be transformed by the DAG combiner except when the condition
20880 // is set by an arithmetic-with-overflow node.
20881 X86::CondCode CCode =
20882 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
20883 CCode = X86::GetOppositeBranchCondition(CCode);
20884 CC = DAG.getConstant(CCode, dl, MVT::i8);
20885 Cond = Cond.getOperand(0).getOperand(1);
20887 } else if (Cond.getOpcode() == ISD::SETCC &&
20888 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
20889 // For FCMP_OEQ, we can emit
20890 // two branches instead of an explicit AND instruction with a
20891 // separate test. However, we only do this if this block doesn't
20892 // have a fall-through edge, because this requires an explicit
20893 // jmp when the condition is false.
20894 if (Op.getNode()->hasOneUse()) {
20895 SDNode *User = *Op.getNode()->use_begin();
20896 // Look for an unconditional branch following this conditional branch.
20897 // We need this because we need to reverse the successors in order
20898 // to implement FCMP_OEQ.
20899 if (User->getOpcode() == ISD::BR) {
20900 SDValue FalseBB = User->getOperand(1);
20902 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
20903 assert(NewBR == User);
20907 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
20908 Cond.getOperand(0), Cond.getOperand(1));
20909 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
20910 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
20911 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
20912 Chain, Dest, CC, Cmp);
20913 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
20918 } else if (Cond.getOpcode() == ISD::SETCC &&
20919 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
20920 // For FCMP_UNE, we can emit
20921 // two branches instead of an explicit OR instruction with a separate test.
20923 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
20924 Cond.getOperand(0), Cond.getOperand(1));
20925 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
20926 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
20927 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
20928 Chain, Dest, CC, Cmp);
20929 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
20936 // Look past the truncate if the high bits are known zero.
20937 if (isTruncWithZeroHighBitsInput(Cond, DAG))
20938 Cond = Cond.getOperand(0);
20940 // We know the result of AND is compared against zero. Try to match it to a BT.
20942 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
20944 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
20953 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
20954 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
20955 Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
20958 Cond = ConvertCmpIfNecessary(Cond, DAG);
20959 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
20960 Chain, Dest, CC, Cond);
20963 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
20964 // Calls to _alloca are needed to probe the stack when allocating more than 4k
20965 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
20966 // that the guard pages used by the OS virtual memory manager are allocated in
20967 // the correct sequence.
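// For instance (illustrative): a single 12KB dynamic allocation must touch a
// byte in each new 4K page (roughly SP-4K, SP-8K, SP-12K) so that each guard
// page is committed in turn, rather than skipping past an uncommitted guard
// page with one large SP adjustment.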
20969 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
20970 SelectionDAG &DAG) const {
20971 MachineFunction &MF = DAG.getMachineFunction();
20972 bool SplitStack = MF.shouldSplitStack();
20973 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
20974 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
20975 SplitStack || EmitStackProbe;
20979 SDNode *Node = Op.getNode();
20980 SDValue Chain = Op.getOperand(0);
20981 SDValue Size = Op.getOperand(1);
20982 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
20983 EVT VT = Node->getValueType(0);
20985 // Chain the dynamic stack allocation so that it doesn't modify the stack
20986 // pointer when other instructions are using the stack.
20987 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
20989 bool Is64Bit = Subtarget.is64Bit();
20990 MVT SPTy = getPointerTy(DAG.getDataLayout());
20994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20995 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
20996 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
20997 " not tell us which reg is the stack pointer!");
20999 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
21000 Chain = SP.getValue(1);
21001 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21002 unsigned StackAlign = TFI.getStackAlignment();
21003 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
21004 if (Align > StackAlign)
21005 Result = DAG.getNode(ISD::AND, dl, VT, Result,
21006 DAG.getConstant(-(uint64_t)Align, dl, VT));
21007 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
21008 } else if (SplitStack) {
21009 MachineRegisterInfo &MRI = MF.getRegInfo();
21012 // The 64-bit implementation of segmented stacks needs to clobber both r10
21013 // and r11. This makes it impossible to use it along with nested parameters.
21014 const Function &F = MF.getFunction();
21015 for (const auto &A : F.args()) {
21016 if (A.hasNestAttr())
21017 report_fatal_error("Cannot use segmented stacks with functions that "
21018 "have nested arguments.");
21022 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
21023 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
21024 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
21025 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
21026 DAG.getRegister(Vreg, SPTy));
21028 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21029 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
21030 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
21032 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21033 unsigned SPReg = RegInfo->getStackRegister();
21034 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
21035 Chain = SP.getValue(1);
21038 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
21039 DAG.getConstant(-(uint64_t)Align, dl, VT));
21040 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
21046 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
21047 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
21049 SDValue Ops[2] = {Result, Chain};
21050 return DAG.getMergeValues(Ops, dl);
21053 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
21054 MachineFunction &MF = DAG.getMachineFunction();
21055 auto PtrVT = getPointerTy(MF.getDataLayout());
21056 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21058 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
21061 if (!Subtarget.is64Bit() ||
21062 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
21063 // vastart just stores the address of the VarArgsFrameIndex slot into the
21064 // memory location argument.
21065 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
21066 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
21067 MachinePointerInfo(SV));
21071 // gp_offset (0 - 6 * 8)
21072 // fp_offset (48 - 48 + 8 * 16)
21073 // overflow_arg_area (points to parameters passed in memory).
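// For reference, the SysV x86-64 __va_list_tag this mirrors is roughly
// (paraphrased from the ABI, not code in this file):
//   struct __va_list_tag {
//     unsigned gp_offset;       // 0..48, 8 bytes consumed per GP register
//     unsigned fp_offset;       // 48..176, 16 bytes consumed per XMM register
//     void *overflow_arg_area;  // arguments passed on the stack
//     void *reg_save_area;      // spilled register-argument save area
//   };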
21075 SmallVector<SDValue, 8> MemOps;
21076 SDValue FIN = Op.getOperand(1);
21078 SDValue Store = DAG.getStore(
21079 Op.getOperand(0), DL,
21080 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
21081 MachinePointerInfo(SV));
21082 MemOps.push_back(Store);
21085 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
21086 Store = DAG.getStore(
21087 Op.getOperand(0), DL,
21088 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
21089 MachinePointerInfo(SV, 4));
21090 MemOps.push_back(Store);
21092 // Store ptr to overflow_arg_area
21093 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
21094 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
21096 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
21097 MemOps.push_back(Store);
21099 // Store ptr to reg_save_area.
21100 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
21101 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
21102 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
21103 Store = DAG.getStore(
21104 Op.getOperand(0), DL, RSFIN, FIN,
21105 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
21106 MemOps.push_back(Store);
21107 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
21110 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
21111 assert(Subtarget.is64Bit() &&
21112 "LowerVAARG only handles 64-bit va_arg!");
21113 assert(Op.getNumOperands() == 4);
21115 MachineFunction &MF = DAG.getMachineFunction();
21116 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
21117 // The Win64 ABI uses char* instead of a structure.
21118 return DAG.expandVAArg(Op.getNode());
21120 SDValue Chain = Op.getOperand(0);
21121 SDValue SrcPtr = Op.getOperand(1);
21122 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
21123 unsigned Align = Op.getConstantOperandVal(3);
21126 EVT ArgVT = Op.getNode()->getValueType(0);
21127 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21128 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
21131 // Decide which area this value should be read from.
21132 // TODO: Implement the AMD64 ABI in its entirety. This simple
21133 // selection mechanism works only for the basic types.
21134 if (ArgVT == MVT::f80) {
21135 llvm_unreachable("va_arg for f80 not yet implemented");
21136 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
21137 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
21138 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
21139 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
21141 llvm_unreachable("Unhandled argument type in LowerVAARG");
21144 if (ArgMode == 2) {
21145 // Sanity Check: Make sure using fp_offset makes sense.
21146 assert(!Subtarget.useSoftFloat() &&
21147 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
21148 Subtarget.hasSSE1());
21151 // Insert a VAARG_64 node into the DAG.
21152 // VAARG_64 returns two values: the variable argument address and the chain.
21153 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
21154 DAG.getConstant(ArgMode, dl, MVT::i8),
21155 DAG.getConstant(Align, dl, MVT::i32)};
21156 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
21157 SDValue VAARG = DAG.getMemIntrinsicNode(
21158 X86ISD::VAARG_64, dl,
21159 VTs, InstOps, MVT::i64,
21160 MachinePointerInfo(SV),
21162 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
21163 Chain = VAARG.getValue(1);
21165 // Load the next argument and return it
21166 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
21169 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
21170 SelectionDAG &DAG) {
21171 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
21172 // where a va_list is still an i8*.
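// That structure form is 4 + 4 + 8 + 8 = 24 bytes, which is why the
// non-Windows path below copies exactly 24 bytes with a memcpy.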
21173 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
21174 if (Subtarget.isCallingConvWin64(
21175 DAG.getMachineFunction().getFunction().getCallingConv()))
21176 // Probably a Win64 va_copy.
21177 return DAG.expandVACopy(Op.getNode());
21179 SDValue Chain = Op.getOperand(0);
21180 SDValue DstPtr = Op.getOperand(1);
21181 SDValue SrcPtr = Op.getOperand(2);
21182 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
21183 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21186 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
21187 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
21189 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
21192 // Helper to get immediate/variable SSE shift opcode from other shift opcodes.
21193 static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
21197 case X86ISD::VSHLI:
21198 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
21201 case X86ISD::VSRLI:
21202 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
21205 case X86ISD::VSRAI:
21206 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
21208 llvm_unreachable("Unknown target vector shift node");
21211 /// Handle vector element shifts where the shift amount is a constant.
21212 /// Takes immediate version of shift as input.
21213 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
21214 SDValue SrcOp, uint64_t ShiftAmt,
21215 SelectionDAG &DAG) {
21216 MVT ElementType = VT.getVectorElementType();
21218 // Bitcast the source vector to the output type; this is mainly necessary for
21219 // vXi8/vXi64 shifts.
21220 if (VT != SrcOp.getSimpleValueType())
21221 SrcOp = DAG.getBitcast(VT, SrcOp);
21223 // Fold this packed shift into its first operand if ShiftAmt is 0.
21227 // Check for ShiftAmt >= element width
21228 if (ShiftAmt >= ElementType.getSizeInBits()) {
21229 if (Opc == X86ISD::VSRAI)
21230 ShiftAmt = ElementType.getSizeInBits() - 1;
21232 return DAG.getConstant(0, dl, VT);
21235 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
21236 && "Unknown target vector shift-by-constant node");
21238 // Fold this packed vector shift into a build vector if SrcOp is a
21239 // vector of Constants or UNDEFs.
21240 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
21241 SmallVector<SDValue, 8> Elts;
21242 unsigned NumElts = SrcOp->getNumOperands();
21243 ConstantSDNode *ND;
21246 default: llvm_unreachable("Unknown opcode!");
21247 case X86ISD::VSHLI:
21248 for (unsigned i=0; i!=NumElts; ++i) {
21249 SDValue CurrentOp = SrcOp->getOperand(i);
21250 if (CurrentOp->isUndef()) {
21251 Elts.push_back(CurrentOp);
21254 ND = cast<ConstantSDNode>(CurrentOp);
21255 const APInt &C = ND->getAPIntValue();
21256 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
21259 case X86ISD::VSRLI:
21260 for (unsigned i=0; i!=NumElts; ++i) {
21261 SDValue CurrentOp = SrcOp->getOperand(i);
21262 if (CurrentOp->isUndef()) {
21263 Elts.push_back(CurrentOp);
21266 ND = cast<ConstantSDNode>(CurrentOp);
21267 const APInt &C = ND->getAPIntValue();
21268 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
21271 case X86ISD::VSRAI:
21272 for (unsigned i=0; i!=NumElts; ++i) {
21273 SDValue CurrentOp = SrcOp->getOperand(i);
21274 if (CurrentOp->isUndef()) {
21275 Elts.push_back(CurrentOp);
21278 ND = cast<ConstantSDNode>(CurrentOp);
21279 const APInt &C = ND->getAPIntValue();
21280 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
21285 return DAG.getBuildVector(VT, dl, Elts);
21288 return DAG.getNode(Opc, dl, VT, SrcOp,
21289 DAG.getConstant(ShiftAmt, dl, MVT::i8));
21292 /// Handle vector element shifts where the shift amount may or may not be a
21293 /// constant. Takes immediate version of shift as input.
21294 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
21295 SDValue SrcOp, SDValue ShAmt,
21296 const X86Subtarget &Subtarget,
21297 SelectionDAG &DAG) {
21298 MVT SVT = ShAmt.getSimpleValueType();
21299 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
21301 // Catch shift-by-constant.
21302 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
21303 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
21304 CShAmt->getZExtValue(), DAG);
21306 // Change opcode to non-immediate version.
21307 Opc = getTargetVShiftUniformOpcode(Opc, true);
21309 // Need to build a vector containing the shift amount.
21310 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
21311 // +====================+============+=======================================+
21312 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
21313 // +====================+============+=======================================+
21314 // | i64 | Yes, No | Use ShAmt as lowest elt |
21315 // | i32 | Yes | zero-extend in-reg |
21316 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
21317 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
21318 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
21319 // +====================+============+=======================================+
21321 if (SVT == MVT::i64)
21322 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
21323 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
21324 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21325 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
21326 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
21327 ShAmt = ShAmt.getOperand(0);
21328 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
21329 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
21330 if (Subtarget.hasSSE41())
21331 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
21332 MVT::v2i64, ShAmt);
21334 SDValue ByteShift = DAG.getConstant(
21335 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
21336 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
21337 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
21339 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
21342 } else if (Subtarget.hasSSE41() &&
21343 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
21344 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
21345 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
21346 MVT::v2i64, ShAmt);
21348 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
21349 DAG.getUNDEF(SVT)};
21350 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
21353 // The return type has to be a 128-bit type with the same element
21354 // type as the input type.
21355 MVT EltVT = VT.getVectorElementType();
21356 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
21358 ShAmt = DAG.getBitcast(ShVT, ShAmt);
21359 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
21362 /// Return Mask with the necessary casting or extending
21363 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
21364 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
21365 const X86Subtarget &Subtarget, SelectionDAG &DAG,
21368 if (isAllOnesConstant(Mask))
21369 return DAG.getConstant(1, dl, MaskVT);
21370 if (X86::isZeroNode(Mask))
21371 return DAG.getConstant(0, dl, MaskVT);
21373 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
21375 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
21376 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
21377 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
21378 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
21380 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
21381 DAG.getConstant(0, dl, MVT::i32));
21382 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
21383 DAG.getConstant(1, dl, MVT::i32));
21385 Lo = DAG.getBitcast(MVT::v32i1, Lo);
21386 Hi = DAG.getBitcast(MVT::v32i1, Hi);
21388 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
21390 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
21391 Mask.getSimpleValueType().getSizeInBits());
21392 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
21393 // are extracted by EXTRACT_SUBVECTOR.
21394 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
21395 DAG.getBitcast(BitcastVT, Mask),
21396 DAG.getIntPtrConstant(0, dl));
21400 /// Return (and \p Op, \p Mask) for compare instructions or
21401 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
21402 /// necessary casting or extending for \p Mask when lowering masking intrinsics
21403 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
21404 SDValue PreservedSrc,
21405 const X86Subtarget &Subtarget,
21406 SelectionDAG &DAG) {
21407 MVT VT = Op.getSimpleValueType();
21408 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21409 unsigned OpcodeSelect = ISD::VSELECT;
21412 if (isAllOnesConstant(Mask))
21415 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21417 if (PreservedSrc.isUndef())
21418 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
21419 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
21422 /// Creates an SDNode for a predicated scalar operation.
21423 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
21424 /// The mask comes in as MVT::i8 and should be transformed
21425 /// to MVT::v1i1 while lowering masking intrinsics.
21426 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
21427 /// "X86select" instead of "vselect". We just can't create the "vselect" node
21428 /// for a scalar instruction.
21429 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
21430 SDValue PreservedSrc,
21431 const X86Subtarget &Subtarget,
21432 SelectionDAG &DAG) {
21434 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
21435 if (MaskConst->getZExtValue() & 0x1)
21438 MVT VT = Op.getSimpleValueType();
21441 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
21442 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
21443 DAG.getBitcast(MVT::v8i1, Mask),
21444 DAG.getIntPtrConstant(0, dl));
21445 if (Op.getOpcode() == X86ISD::FSETCCM ||
21446 Op.getOpcode() == X86ISD::FSETCCM_RND ||
21447 Op.getOpcode() == X86ISD::VFPCLASSS)
21448 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
21450 if (PreservedSrc.isUndef())
21451 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
21452 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
21455 static int getSEHRegistrationNodeSize(const Function *Fn) {
21456 if (!Fn->hasPersonalityFn())
21457 report_fatal_error(
21458 "querying registration node size for function without personality");
21459 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
21460 // WinEHStatePass for the full struct definition.
21461 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
21462 case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
  report_fatal_error(
21467 "can only recover FP for 32-bit MSVC EH personality functions");
21470 /// When the MSVC runtime transfers control to us, either to an outlined
21471 /// function or when returning to a parent frame after catching an exception, we
21472 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
21473 /// Here's the math:
21474 /// RegNodeBase = EntryEBP - RegNodeSize
21475 /// ParentFP = RegNodeBase - ParentFrameOffset
21476 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
21477 /// subtracting the offset (negative on x86) takes us back to the parent FP.
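/// As a concrete (hypothetical) example on 32-bit SEH, with EntryEBP = 0x1000,
/// RegNodeSize = 24 and ParentFrameOffset = -64:
///   RegNodeBase = 0x1000 - 24    = 0xFE8
///   ParentFP    = 0xFE8 - (-64)  = 0x1028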
21478 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
21479 SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;
21483 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21484 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
21486 // It's possible that the parent function no longer has a personality function
21487 // if the exceptional code was optimized away, in which case we just return
21488 // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;
21492 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
21493 // registration, or the .set_setframe offset.
21494 MCSymbol *OffsetSym =
21495 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
21496 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
21497 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
21498 SDValue ParentFrameOffset =
21499 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
21501 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
21502 // prologue to RBP in the parent function.
21503 const X86Subtarget &Subtarget =
21504 static_cast<const X86Subtarget &>(DAG.getSubtarget());
21505 if (Subtarget.is64Bit())
21506 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
21508 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
21509 // RegNodeBase = EntryEBP - RegNodeSize
21510 // ParentFP = RegNodeBase - ParentFrameOffset
21511 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
21512 DAG.getConstant(RegNodeSize, dl, PtrVT));
21513 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
21516 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
21517 SelectionDAG &DAG) const {
21518 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
21519 auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;

    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21529 MVT VT = Op.getSimpleValueType();
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
  if (IntrData) {
    switch(IntrData->Type) {
21533 case INTR_TYPE_1OP: {
21534 // We specify 2 possible opcodes for intrinsics with rounding modes.
21535 // First, we check if the intrinsic may have non-default rounding mode,
21536 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
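    // (The Opc0/Opc1 pair comes from the X86IntrinsicsInfo.h table; Opc1 is
    //  the "_RND"-style node that carries an explicit i32 rounding operand
    //  encoded with the X86::STATIC_ROUNDING values. Illustrative note, not
    //  tied to any particular intrinsic.)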
21537 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21538 if (IntrWithRoundingModeOpcode != 0) {
21539 SDValue Rnd = Op.getOperand(2);
21540 if (!isRoundModeCurDirection(Rnd)) {
21541 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
21542 Op.getOperand(1), Rnd);
21545 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
21547 case INTR_TYPE_2OP: {
21548 SDValue Src2 = Op.getOperand(2);
21550 // We specify 2 possible opcodes for intrinsics with rounding modes.
21551 // First, we check if the intrinsic may have non-default rounding mode,
21552 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
21553 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21554 if (IntrWithRoundingModeOpcode != 0) {
21555 SDValue Rnd = Op.getOperand(3);
21556 if (!isRoundModeCurDirection(Rnd)) {
21557 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
21558 Op.getOperand(1), Src2, Rnd);
21562 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
21563 Op.getOperand(1), Src2);
21565 case INTR_TYPE_3OP:
21566 case INTR_TYPE_3OP_IMM8: {
21567 SDValue Src1 = Op.getOperand(1);
21568 SDValue Src2 = Op.getOperand(2);
21569 SDValue Src3 = Op.getOperand(3);
21571 if (IntrData->Type == INTR_TYPE_3OP_IMM8)
21572 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
21574 // We specify 2 possible opcodes for intrinsics with rounding modes.
21575 // First, we check if the intrinsic may have non-default rounding mode,
21576 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
21577 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21578 if (IntrWithRoundingModeOpcode != 0) {
21579 SDValue Rnd = Op.getOperand(4);
21580 if (!isRoundModeCurDirection(Rnd)) {
21581 return DAG.getNode(IntrWithRoundingModeOpcode,
21582 dl, Op.getValueType(),
21583 Src1, Src2, Src3, Rnd);
21587 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
21590 case INTR_TYPE_4OP:
21591 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
21592 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
21593 case INTR_TYPE_1OP_MASK_RM: {
21594 SDValue Src = Op.getOperand(1);
21595 SDValue PassThru = Op.getOperand(2);
21596 SDValue Mask = Op.getOperand(3);
21597 SDValue RoundingMode;
21598 // We always add rounding mode to the Node.
21599 // If the rounding mode is not specified, we add the
21600 // "current direction" mode.
    if (Op.getNumOperands() == 4)
      RoundingMode =
        DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
    else
      RoundingMode = Op.getOperand(4);
    assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
    return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                            RoundingMode),
                                Mask, PassThru, Subtarget, DAG);
  }
21611 case INTR_TYPE_1OP_MASK: {
21612 SDValue Src = Op.getOperand(1);
21613 SDValue PassThru = Op.getOperand(2);
21614 SDValue Mask = Op.getOperand(3);
21615 // We add rounding mode to the Node when
21616 // - RM Opcode is specified and
21617 // - RM is not "current direction".
21618 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21619 if (IntrWithRoundingModeOpcode != 0) {
21620 SDValue Rnd = Op.getOperand(4);
21621 if (!isRoundModeCurDirection(Rnd)) {
21622 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
21623 dl, Op.getValueType(),
21625 Mask, PassThru, Subtarget, DAG);
21628 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
21629 Mask, PassThru, Subtarget, DAG);
21631 case INTR_TYPE_SCALAR_MASK: {
21632 SDValue Src1 = Op.getOperand(1);
21633 SDValue Src2 = Op.getOperand(2);
21634 SDValue passThru = Op.getOperand(3);
21635 SDValue Mask = Op.getOperand(4);
21636 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21637 // There are 2 kinds of intrinsics in this group:
21638 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
21639 // (2) With rounding mode and sae - 7 operands.
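    // (Note: these operand counts include the intrinsic-id operand 0, so a
    //  masked scalar op without any rounding/sae operand has 5 operands.)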
21640 bool HasRounding = IntrWithRoundingModeOpcode != 0;
      if (Op.getNumOperands() == (5U + HasRounding)) {
        if (HasRounding) {
          SDValue Rnd = Op.getOperand(5);
21644 if (!isRoundModeCurDirection(Rnd))
21645 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
21646 dl, VT, Src1, Src2, Rnd),
21647 Mask, passThru, Subtarget, DAG);
21649 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
21651 Mask, passThru, Subtarget, DAG);
21654 assert(Op.getNumOperands() == (6U + HasRounding) &&
21655 "Unexpected intrinsic form");
21656 SDValue RoundingMode = Op.getOperand(5);
      if (HasRounding) {
        SDValue Sae = Op.getOperand(6);
21659 if (!isRoundModeCurDirection(Sae))
21660 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
21661 dl, VT, Src1, Src2,
21662 RoundingMode, Sae),
21663 Mask, passThru, Subtarget, DAG);
21665 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
21666 Src2, RoundingMode),
21667 Mask, passThru, Subtarget, DAG);
21669 case INTR_TYPE_SCALAR_MASK_RM: {
21670 SDValue Src1 = Op.getOperand(1);
21671 SDValue Src2 = Op.getOperand(2);
21672 SDValue Src0 = Op.getOperand(3);
21673 SDValue Mask = Op.getOperand(4);
21674 // There are 2 kinds of intrinsics in this group:
21675 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
21676 // (2) With rounding mode and sae - 7 operands.
21677 if (Op.getNumOperands() == 6) {
21678 SDValue Sae = Op.getOperand(5);
21679 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
21681 Mask, Src0, Subtarget, DAG);
21683 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
21684 SDValue RoundingMode = Op.getOperand(5);
21685 SDValue Sae = Op.getOperand(6);
21686 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
21687 RoundingMode, Sae),
21688 Mask, Src0, Subtarget, DAG);
21690 case INTR_TYPE_2OP_MASK: {
21691 SDValue Src1 = Op.getOperand(1);
21692 SDValue Src2 = Op.getOperand(2);
21693 SDValue PassThru = Op.getOperand(3);
21694 SDValue Mask = Op.getOperand(4);
21696 // We specify 2 possible opcodes for intrinsics with rounding modes.
21697 // First, we check if the intrinsic may have non-default rounding mode,
21698 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
21699 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21700 if (IntrWithRoundingModeOpcode != 0) {
21701 SDValue Rnd = Op.getOperand(5);
21702 if (!isRoundModeCurDirection(Rnd)) {
21703 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
21704 dl, Op.getValueType(),
21706 Mask, PassThru, Subtarget, DAG);
21709 // TODO: Intrinsics should have fast-math-flags to propagate.
21710 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
21711 Mask, PassThru, Subtarget, DAG);
21713 case INTR_TYPE_2OP_MASK_RM: {
21714 SDValue Src1 = Op.getOperand(1);
21715 SDValue Src2 = Op.getOperand(2);
21716 SDValue PassThru = Op.getOperand(3);
21717 SDValue Mask = Op.getOperand(4);
    // We specify 2 possible modes for intrinsics, with/without rounding
    // modes.
    // First, we check if the intrinsic has a rounding mode (6 operands);
    // if not, we set the rounding mode to "current".
    SDValue Rnd;
    if (Op.getNumOperands() == 6)
      Rnd = Op.getOperand(5);
    else
      Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
21727 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
21729 Mask, PassThru, Subtarget, DAG);
21731 case INTR_TYPE_3OP_SCALAR_MASK: {
21732 SDValue Src1 = Op.getOperand(1);
21733 SDValue Src2 = Op.getOperand(2);
21734 SDValue Src3 = Op.getOperand(3);
21735 SDValue PassThru = Op.getOperand(4);
21736 SDValue Mask = Op.getOperand(5);
21738 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21739 if (IntrWithRoundingModeOpcode != 0) {
21740 SDValue Rnd = Op.getOperand(6);
21741 if (!isRoundModeCurDirection(Rnd))
21742 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
21743 dl, VT, Src1, Src2, Src3, Rnd),
21744 Mask, PassThru, Subtarget, DAG);
21746 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
21748 Mask, PassThru, Subtarget, DAG);
21750 case INTR_TYPE_3OP_MASK: {
21751 SDValue Src1 = Op.getOperand(1);
21752 SDValue Src2 = Op.getOperand(2);
21753 SDValue Src3 = Op.getOperand(3);
21754 SDValue PassThru = Op.getOperand(4);
21755 SDValue Mask = Op.getOperand(5);
21757 // We specify 2 possible opcodes for intrinsics with rounding modes.
21758 // First, we check if the intrinsic may have non-default rounding mode,
21759 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
21760 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21761 if (IntrWithRoundingModeOpcode != 0) {
21762 SDValue Rnd = Op.getOperand(6);
21763 if (!isRoundModeCurDirection(Rnd)) {
21764 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
21765 dl, Op.getValueType(),
21766 Src1, Src2, Src3, Rnd),
21767 Mask, PassThru, Subtarget, DAG);
21770 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
21772 Mask, PassThru, Subtarget, DAG);
21775 SDValue Src1 = Op.getOperand(1);
21776 SDValue Src2 = Op.getOperand(2);
21778 // Swap Src1 and Src2 in the node creation
21779 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
21782 // NOTE: We need to swizzle the operands to pass the multiply operands
21784 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
21785 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
21787 // ISD::FP_ROUND has a second argument that indicates if the truncation
21788 // does not change the value. Set it to 0 since it can change.
21789 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
21790 DAG.getIntPtrConstant(0, dl));
21791 case CVTPD2PS_RND_MASK: {
21792 SDValue Src = Op.getOperand(1);
21793 SDValue PassThru = Op.getOperand(2);
21794 SDValue Mask = Op.getOperand(3);
21795 // We add rounding mode to the Node when
21796 // - RM Opcode is specified and
21797 // - RM is not "current direction".
21798 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
21799 if (IntrWithRoundingModeOpcode != 0) {
21800 SDValue Rnd = Op.getOperand(4);
21801 if (!isRoundModeCurDirection(Rnd)) {
21802 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
21803 dl, Op.getValueType(),
21805 Mask, PassThru, Subtarget, DAG);
21808 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
21809 // ISD::FP_ROUND has a second argument that indicates if the truncation
21810 // does not change the value. Set it to 0 since it can change.
21811 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
21812 DAG.getIntPtrConstant(0, dl)),
21813 Mask, PassThru, Subtarget, DAG);
21816 SDValue Src1 = Op.getOperand(1);
21817 SDValue Imm = Op.getOperand(2);
21818 SDValue Mask = Op.getOperand(3);
21819 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
    SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
                                               Subtarget, DAG);
21822 // Need to fill with zeros to ensure the bitcast will produce zeroes
21823 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
21824 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
21825 DAG.getConstant(0, dl, MVT::v8i1),
21826 FPclassMask, DAG.getIntPtrConstant(0, dl));
21827 return DAG.getBitcast(MVT::i8, Ins);
21830 case CMP_MASK_CC: {
    MVT MaskVT = Op.getSimpleValueType();
    SDValue Cmp;
    SDValue CC = Op.getOperand(3);
21834 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
21835 // We specify 2 possible opcodes for intrinsics with rounding modes.
21836 // First, we check if the intrinsic may have non-default rounding mode,
21837 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
21838 if (IntrData->Opc1 != 0) {
21839 SDValue Rnd = Op.getOperand(4);
21840 if (!isRoundModeCurDirection(Rnd))
21841 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
21842 Op.getOperand(2), CC, Rnd);
    // Default rounding mode.
    if (!Cmp.getNode())
      Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                        Op.getOperand(2), CC);

    return Cmp;
  }
21851 case CMP_MASK_SCALAR_CC: {
21852 SDValue Src1 = Op.getOperand(1);
21853 SDValue Src2 = Op.getOperand(2);
21854 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
    SDValue Mask = Op.getOperand(4);
    SDValue Cmp;
21858 if (IntrData->Opc1 != 0) {
21859 SDValue Rnd = Op.getOperand(5);
21860 if (!isRoundModeCurDirection(Rnd))
21861 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
    // Default rounding mode.
    if (!Cmp.getNode())
      Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
    SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
                                           Subtarget, DAG);
21869 // Need to fill with zeros to ensure the bitcast will produce zeroes
21870 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
21871 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
21872 DAG.getConstant(0, dl, MVT::v8i1),
21873 CmpMask, DAG.getIntPtrConstant(0, dl));
21874 return DAG.getBitcast(MVT::i8, Ins);
21876 case COMI: { // Comparison intrinsics
21877 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
21878 SDValue LHS = Op.getOperand(1);
21879 SDValue RHS = Op.getOperand(2);
21880 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
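    // (U)COMIS sets ZF/PF/CF like an unsigned compare and raises all three
    // flags for an unordered result, which is why SETEQ/SETNE below also
    // have to check PF. The swapped compare (InvComi) realizes LT/LE as
    // GT/GE with the operands reversed.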
    SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
    SDValue SetCC;
    switch (CC) {
21884 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
21885 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
21886 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
21887 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
21890 case ISD::SETNE: { // (ZF = 1 or PF = 1)
21891 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
21892 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
21893 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
21896 case ISD::SETGT: // (CF = 0 and ZF = 0)
21897 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
21899 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
21900 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
21903 case ISD::SETGE: // CF = 0
21904 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
21906 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
21907 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
21910 llvm_unreachable("Unexpected illegal condition!");
21912 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21914 case COMI_RM: { // Comparison intrinsics with Sae
21915 SDValue LHS = Op.getOperand(1);
21916 SDValue RHS = Op.getOperand(2);
21917 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
    SDValue Sae = Op.getOperand(4);

    SDValue FCmp;
    if (isRoundModeCurDirection(Sae))
      FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                         DAG.getConstant(CondVal, dl, MVT::i8));
    else
      FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
21926 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
21927 // Need to fill with zeros to ensure the bitcast will produce zeroes
21928 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
21929 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
21930 DAG.getConstant(0, dl, MVT::v16i1),
21931 FCmp, DAG.getIntPtrConstant(0, dl));
21932 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
21933 DAG.getBitcast(MVT::i16, Ins));
21936 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
21937 Op.getOperand(1), Op.getOperand(2), Subtarget,
21939 case COMPRESS_EXPAND_IN_REG: {
21940 SDValue Mask = Op.getOperand(3);
21941 SDValue DataToCompress = Op.getOperand(1);
21942 SDValue PassThru = Op.getOperand(2);
21943 if (isAllOnesConstant(Mask)) // return data as is
21944 return Op.getOperand(1);
    return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                            DataToCompress),
21948 Mask, PassThru, Subtarget, DAG);
21951 case FIXUPIMMS_MASKZ:
21953 case FIXUPIMM_MASKZ:{
21954 SDValue Src1 = Op.getOperand(1);
21955 SDValue Src2 = Op.getOperand(2);
21956 SDValue Src3 = Op.getOperand(3);
21957 SDValue Imm = Op.getOperand(4);
21958 SDValue Mask = Op.getOperand(5);
21959 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
21960 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
    // We specify 2 possible modes for intrinsics, with/without rounding
    // modes.
    // First, we check if the intrinsic has a rounding mode (7 operands);
    // if not, we set the rounding mode to "current".
    SDValue Rnd;
    if (Op.getNumOperands() == 7)
      Rnd = Op.getOperand(6);
    else
      Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
21970 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
21971 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
21972 Src1, Src2, Src3, Imm, Rnd),
21973 Mask, Passthru, Subtarget, DAG);
21974 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
21975 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
21976 Src1, Src2, Src3, Imm, Rnd),
21977 Mask, Passthru, Subtarget, DAG);
21980 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
21981 // Clear the upper bits of the rounding immediate so that the legacy
21982 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
    SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
                                       Op.getOperand(2),
                                       DAG.getConstant(0xf, dl, MVT::i32));
21986 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
21987 Op.getOperand(1), RoundingMode);
21990 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
21991 // Clear the upper bits of the rounding immediate so that the legacy
21992 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
    SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
                                       Op.getOperand(3),
                                       DAG.getConstant(0xf, dl, MVT::i32));
21996 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
21997 Op.getOperand(1), Op.getOperand(2), RoundingMode);
22001 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
22002 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
    SDValue Res;
    // If the carry-in is zero, then we should just use ADD/SUB instead of
    // ADC/SBB.
    if (isNullConstant(Op.getOperand(1))) {
      Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
                        Op.getOperand(3));
    } else {
      SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
22012 DAG.getConstant(-1, dl, MVT::i8));
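      // Adding all-ones (-1) to any non-zero carry-in value produces a carry,
      // so this ADD rematerializes the incoming carry bit in EFLAGS for the
      // ADC/SBB node built below.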
22013 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
                        Op.getOperand(3), GenCF.getValue(1));
    }
22016 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
22017 SDValue Results[] = { SetCC, Res };
22018 return DAG.getMergeValues(Results, dl);
22020 case CVTPD2PS_MASK:
22022 case TRUNCATE_TO_REG: {
22023 SDValue Src = Op.getOperand(1);
22024 SDValue PassThru = Op.getOperand(2);
22025 SDValue Mask = Op.getOperand(3);
22027 if (isAllOnesConstant(Mask))
22028 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
22030 MVT SrcVT = Src.getSimpleValueType();
22031 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
22032 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22033 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
22036 case CVTPS2PH_MASK: {
22037 SDValue Src = Op.getOperand(1);
22038 SDValue Rnd = Op.getOperand(2);
22039 SDValue PassThru = Op.getOperand(3);
22040 SDValue Mask = Op.getOperand(4);
22042 if (isAllOnesConstant(Mask))
22043 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
22045 MVT SrcVT = Src.getSimpleValueType();
22046 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
22047 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22048 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
22058 default: return SDValue(); // Don't custom lower most intrinsics.
22060 // ptest and testp intrinsics. The intrinsic these come from are designed to
22061 // return an integer value, not just an instruction so lower it to the ptest
22062 // or testp pattern and a setcc for the result.
22063 case Intrinsic::x86_avx512_ktestc_b:
22064 case Intrinsic::x86_avx512_ktestc_w:
22065 case Intrinsic::x86_avx512_ktestc_d:
22066 case Intrinsic::x86_avx512_ktestc_q:
22067 case Intrinsic::x86_avx512_ktestz_b:
22068 case Intrinsic::x86_avx512_ktestz_w:
22069 case Intrinsic::x86_avx512_ktestz_d:
22070 case Intrinsic::x86_avx512_ktestz_q:
22071 case Intrinsic::x86_sse41_ptestz:
22072 case Intrinsic::x86_sse41_ptestc:
22073 case Intrinsic::x86_sse41_ptestnzc:
22074 case Intrinsic::x86_avx_ptestz_256:
22075 case Intrinsic::x86_avx_ptestc_256:
22076 case Intrinsic::x86_avx_ptestnzc_256:
22077 case Intrinsic::x86_avx_vtestz_ps:
22078 case Intrinsic::x86_avx_vtestc_ps:
22079 case Intrinsic::x86_avx_vtestnzc_ps:
22080 case Intrinsic::x86_avx_vtestz_pd:
22081 case Intrinsic::x86_avx_vtestc_pd:
22082 case Intrinsic::x86_avx_vtestnzc_pd:
22083 case Intrinsic::x86_avx_vtestz_ps_256:
22084 case Intrinsic::x86_avx_vtestc_ps_256:
22085 case Intrinsic::x86_avx_vtestnzc_ps_256:
22086 case Intrinsic::x86_avx_vtestz_pd_256:
22087 case Intrinsic::x86_avx_vtestc_pd_256:
22088 case Intrinsic::x86_avx_vtestnzc_pd_256: {
22089 unsigned TestOpc = X86ISD::PTEST;
22090 X86::CondCode X86CC;
22092 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
22093 case Intrinsic::x86_avx512_ktestc_b:
22094 case Intrinsic::x86_avx512_ktestc_w:
22095 case Intrinsic::x86_avx512_ktestc_d:
22096 case Intrinsic::x86_avx512_ktestc_q:
22098 TestOpc = X86ISD::KTEST;
22099 X86CC = X86::COND_B;
22101 case Intrinsic::x86_avx512_ktestz_b:
22102 case Intrinsic::x86_avx512_ktestz_w:
22103 case Intrinsic::x86_avx512_ktestz_d:
22104 case Intrinsic::x86_avx512_ktestz_q:
22105 TestOpc = X86ISD::KTEST;
22106 X86CC = X86::COND_E;
22108 case Intrinsic::x86_avx_vtestz_ps:
22109 case Intrinsic::x86_avx_vtestz_pd:
22110 case Intrinsic::x86_avx_vtestz_ps_256:
22111 case Intrinsic::x86_avx_vtestz_pd_256:
22112 TestOpc = X86ISD::TESTP;
22114 case Intrinsic::x86_sse41_ptestz:
22115 case Intrinsic::x86_avx_ptestz_256:
22117 X86CC = X86::COND_E;
22119 case Intrinsic::x86_avx_vtestc_ps:
22120 case Intrinsic::x86_avx_vtestc_pd:
22121 case Intrinsic::x86_avx_vtestc_ps_256:
22122 case Intrinsic::x86_avx_vtestc_pd_256:
22123 TestOpc = X86ISD::TESTP;
22125 case Intrinsic::x86_sse41_ptestc:
22126 case Intrinsic::x86_avx_ptestc_256:
22128 X86CC = X86::COND_B;
22130 case Intrinsic::x86_avx_vtestnzc_ps:
22131 case Intrinsic::x86_avx_vtestnzc_pd:
22132 case Intrinsic::x86_avx_vtestnzc_ps_256:
22133 case Intrinsic::x86_avx_vtestnzc_pd_256:
22134 TestOpc = X86ISD::TESTP;
22136 case Intrinsic::x86_sse41_ptestnzc:
22137 case Intrinsic::x86_avx_ptestnzc_256:
22139 X86CC = X86::COND_A;
22143 SDValue LHS = Op.getOperand(1);
22144 SDValue RHS = Op.getOperand(2);
22145 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
22146 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
22147 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
22150 case Intrinsic::x86_sse42_pcmpistria128:
22151 case Intrinsic::x86_sse42_pcmpestria128:
22152 case Intrinsic::x86_sse42_pcmpistric128:
22153 case Intrinsic::x86_sse42_pcmpestric128:
22154 case Intrinsic::x86_sse42_pcmpistrio128:
22155 case Intrinsic::x86_sse42_pcmpestrio128:
22156 case Intrinsic::x86_sse42_pcmpistris128:
22157 case Intrinsic::x86_sse42_pcmpestris128:
22158 case Intrinsic::x86_sse42_pcmpistriz128:
22159 case Intrinsic::x86_sse42_pcmpestriz128: {
22161 X86::CondCode X86CC;
22163 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
22164 case Intrinsic::x86_sse42_pcmpistria128:
22165 Opcode = X86ISD::PCMPISTR;
22166 X86CC = X86::COND_A;
22168 case Intrinsic::x86_sse42_pcmpestria128:
22169 Opcode = X86ISD::PCMPESTR;
22170 X86CC = X86::COND_A;
22172 case Intrinsic::x86_sse42_pcmpistric128:
22173 Opcode = X86ISD::PCMPISTR;
22174 X86CC = X86::COND_B;
22176 case Intrinsic::x86_sse42_pcmpestric128:
22177 Opcode = X86ISD::PCMPESTR;
22178 X86CC = X86::COND_B;
22180 case Intrinsic::x86_sse42_pcmpistrio128:
22181 Opcode = X86ISD::PCMPISTR;
22182 X86CC = X86::COND_O;
22184 case Intrinsic::x86_sse42_pcmpestrio128:
22185 Opcode = X86ISD::PCMPESTR;
22186 X86CC = X86::COND_O;
22188 case Intrinsic::x86_sse42_pcmpistris128:
22189 Opcode = X86ISD::PCMPISTR;
22190 X86CC = X86::COND_S;
22192 case Intrinsic::x86_sse42_pcmpestris128:
22193 Opcode = X86ISD::PCMPESTR;
22194 X86CC = X86::COND_S;
22196 case Intrinsic::x86_sse42_pcmpistriz128:
22197 Opcode = X86ISD::PCMPISTR;
22198 X86CC = X86::COND_E;
22200 case Intrinsic::x86_sse42_pcmpestriz128:
22201 Opcode = X86ISD::PCMPESTR;
22202 X86CC = X86::COND_E;
22205 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
22206 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
22207 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
22208 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
22209 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
22212 case Intrinsic::x86_sse42_pcmpistri128:
22213 case Intrinsic::x86_sse42_pcmpestri128: {
22215 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
22216 Opcode = X86ISD::PCMPISTR;
22218 Opcode = X86ISD::PCMPESTR;
22220 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
22221 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
22222 return DAG.getNode(Opcode, dl, VTs, NewOps);
22225 case Intrinsic::x86_sse42_pcmpistrm128:
22226 case Intrinsic::x86_sse42_pcmpestrm128: {
22228 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
22229 Opcode = X86ISD::PCMPISTR;
22231 Opcode = X86ISD::PCMPESTR;
22233 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
22234 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
22235 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
22238 case Intrinsic::eh_sjlj_lsda: {
22239 MachineFunction &MF = DAG.getMachineFunction();
22240 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22241 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
22242 auto &Context = MF.getMMI().getContext();
22243 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
22244 Twine(MF.getFunctionNumber()));
22245 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
22246 DAG.getMCSymbol(S, PtrVT));
22249 case Intrinsic::x86_seh_lsda: {
22250 // Compute the symbol for the LSDA. We know it'll get emitted later.
22251 MachineFunction &MF = DAG.getMachineFunction();
22252 SDValue Op1 = Op.getOperand(1);
22253 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
22254 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
22255 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
22257 // Generate a simple absolute symbol reference. This intrinsic is only
22258 // supported on 32-bit Windows, which isn't PIC.
22259 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
22260 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
22263 case Intrinsic::eh_recoverfp: {
22264 SDValue FnOp = Op.getOperand(1);
22265 SDValue IncomingFPOp = Op.getOperand(2);
22266 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
22267 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
22269 report_fatal_error(
22270 "llvm.eh.recoverfp must take a function as the first argument");
22271 return recoverFramePointer(DAG, Fn, IncomingFPOp);
22274 case Intrinsic::localaddress: {
22275 // Returns one of the stack, base, or frame pointer registers, depending on
22276 // which is used to reference local variables.
22277 MachineFunction &MF = DAG.getMachineFunction();
22278 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
22280 if (RegInfo->hasBasePointer(MF))
22281 Reg = RegInfo->getBaseRegister();
22282 else // This function handles the SP or FP case.
22283 Reg = RegInfo->getPtrSizedFrameRegister(MF);
22284 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
22289 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
22290 SDValue Src, SDValue Mask, SDValue Base,
22291 SDValue Index, SDValue ScaleOp, SDValue Chain,
22292 const X86Subtarget &Subtarget) {
22294 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
22295 // Scale must be constant.
22298 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
22299 EVT MaskVT = Mask.getValueType();
22300 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
22301 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
22302 SDValue Segment = DAG.getRegister(0, MVT::i32);
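  // Gathers are emitted as machine nodes with the full x86 memory-operand
  // tuple (Base, Scale, Index, Disp, Segment); the displacement is always 0
  // here and segment register 0 means the default segment.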
22303 // If source is undef or we know it won't be used, use a zero vector
22304 // to break register dependency.
22305 // TODO: use undef instead and let BreakFalseDeps deal with it?
22306 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
22307 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
22308 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
22309 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
22310 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
22311 return DAG.getMergeValues(RetOps, dl);
22314 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
22315 SDValue Src, SDValue Mask, SDValue Base,
22316 SDValue Index, SDValue ScaleOp, SDValue Chain,
22317 const X86Subtarget &Subtarget) {
22318 MVT VT = Op.getSimpleValueType();
22320 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
22321 // Scale must be constant.
22324 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
22325 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
22326 VT.getVectorNumElements());
22327 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
22329 // We support two versions of the gather intrinsics. One with scalar mask and
22330 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
22331 if (Mask.getValueType() != MaskVT)
22332 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22334 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
22335 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
22336 SDValue Segment = DAG.getRegister(0, MVT::i32);
22337 // If source is undef or we know it won't be used, use a zero vector
22338 // to break register dependency.
22339 // TODO: use undef instead and let BreakFalseDeps deal with it?
22340 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
22341 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
22342 SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain};
22343 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
22344 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
22345 return DAG.getMergeValues(RetOps, dl);
22348 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
22349 SDValue Src, SDValue Mask, SDValue Base,
22350 SDValue Index, SDValue ScaleOp, SDValue Chain,
22351 const X86Subtarget &Subtarget) {
22353 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
22354 // Scale must be constant.
22357 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
22358 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
22359 SDValue Segment = DAG.getRegister(0, MVT::i32);
22360 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
22361 Src.getSimpleValueType().getVectorNumElements());
22362 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
22364 // We support two versions of the scatter intrinsics. One with scalar mask and
22365 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
22366 if (Mask.getValueType() != MaskVT)
22367 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22369 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
22370 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain};
22371 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
22372 return SDValue(Res, 1);
22375 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
22376 SDValue Mask, SDValue Base, SDValue Index,
22377 SDValue ScaleOp, SDValue Chain,
22378 const X86Subtarget &Subtarget) {
22380 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
22381 // Scale must be constant.
22384 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
22385 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
22386 SDValue Segment = DAG.getRegister(0, MVT::i32);
22388 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
22389 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22390 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
22391 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
22392 return SDValue(Res, 0);
22395 /// Handles the lowering of builtin intrinsic that return the value
22396 /// of the extended control register.
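/// For example, the xgetbv builtin with ECX = 0 reads XCR0, and the two
/// 32-bit halves returned in EDX:EAX are merged back into a single i64 value.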
22397 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
22399 const X86Subtarget &Subtarget,
22400 SmallVectorImpl<SDValue> &Results) {
22401 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
22402 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  // The ECX register is used to select the index of the XCR register to
  // read.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
22409 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
22410 Chain = SDValue(N1, 0);
  SDValue LO, HI;

  // Reads the content of XCR and returns it in registers EDX:EAX.
22413 if (Subtarget.is64Bit()) {
22414 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
22415 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
22418 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
22419 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
22422 Chain = HI.getValue(1);
22424 if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
22426 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
22427 DAG.getConstant(32, DL, MVT::i8));
22428 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

22433 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
22434 SDValue Ops[] = { LO, HI };
22435 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
22436 Results.push_back(Pair);
22437 Results.push_back(Chain);
22440 /// Handles the lowering of builtin intrinsics that read performance monitor
22441 /// counters (x86_rdpmc).
22442 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
22444 const X86Subtarget &Subtarget,
22445 SmallVectorImpl<SDValue> &Results) {
22446 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
22447 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
22454 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
22456 // Reads the content of a 64-bit performance counter and returns it in the
22457 // registers EDX:EAX.
22458 if (Subtarget.is64Bit()) {
22459 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
22460 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
22463 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
22464 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
22467 Chain = HI.getValue(1);
22469 if (Subtarget.is64Bit()) {
22470 // The EAX register is loaded with the low-order 32 bits. The EDX register
22471 // is loaded with the supported high-order bits of the counter.
22472 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
22473 DAG.getConstant(32, DL, MVT::i8));
22474 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

22479 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
22480 SDValue Ops[] = { LO, HI };
22481 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
22482 Results.push_back(Pair);
22483 Results.push_back(Chain);
22486 /// Handles the lowering of builtin intrinsics that read the time stamp counter
22487 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
22488 /// READCYCLECOUNTER nodes.
22489 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
22491 const X86Subtarget &Subtarget,
22492 SmallVectorImpl<SDValue> &Results) {
22493 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22494 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
  SDValue LO, HI;

  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
22498 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
22499 // and the EAX register is loaded with the low-order 32 bits.
22500 if (Subtarget.is64Bit()) {
22501 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
22502 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
22505 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
22506 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
  SDValue Chain = HI.getValue(1);

  SDValue TSC;
22512 if (Subtarget.is64Bit()) {
22513 // The EDX register is loaded with the high-order 32 bits of the MSR, and
22514 // the EAX register is loaded with the low-order 32 bits.
22515 TSC = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
22516 DAG.getConstant(32, DL, MVT::i8));
22517 TSC = DAG.getNode(ISD::OR, DL, MVT::i64, LO, TSC);
22519 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
22520 TSC = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, { LO, HI });
22523 if (Opcode == X86ISD::RDTSCP_DAG) {
22524 assert(N->getNumOperands() == 2 && "Unexpected number of operands!");
22526 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
22527 // the ECX register. Add 'ecx' explicitly to the chain.
22528 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
22531 Results.push_back(TSC);
22532 Results.push_back(ecx);
    Results.push_back(ecx.getValue(1));
    return;
  }

22537 Results.push_back(TSC);
22538 Results.push_back(Chain);
22541 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
22542 SelectionDAG &DAG) {
22543 SmallVector<SDValue, 3> Results;
22545 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
22547 return DAG.getMergeValues(Results, DL);
22550 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
22551 MachineFunction &MF = DAG.getMachineFunction();
22552 SDValue Chain = Op.getOperand(0);
22553 SDValue RegNode = Op.getOperand(2);
22554 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
22556 report_fatal_error("EH registrations only live in functions using WinEH");
22558 // Cast the operand to an alloca, and remember the frame index.
22559 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
22561 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
22562 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
22564 // Return the chain operand without making any DAG nodes.
22568 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
22569 MachineFunction &MF = DAG.getMachineFunction();
22570 SDValue Chain = Op.getOperand(0);
22571 SDValue EHGuard = Op.getOperand(2);
22572 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
22574 report_fatal_error("EHGuard only live in functions using WinEH");
22576 // Cast the operand to an alloca, and remember the frame index.
22577 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
22579 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
22580 EHInfo->EHGuardFrameIndex = FINode->getIndex();
22582 // Return the chain operand without making any DAG nodes.
22586 /// Emit Truncating Store with signed or unsigned saturation.
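/// For example (illustrative only), a v16i32 -> v16i8 signed-saturating
/// truncating store built here is expected to select to an AVX-512
/// VPMOVSDB store.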
22588 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
22589 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
22590 SelectionDAG &DAG) {
22592 SDVTList VTs = DAG.getVTList(MVT::Other);
22593 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
22594 SDValue Ops[] = { Chain, Val, Ptr, Undef };
22596 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
22597 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
22600 /// Emit Masked Truncating Store with signed or unsigned saturation.
22602 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
22603 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
22604 MachineMemOperand *MMO, SelectionDAG &DAG) {
22606 SDVTList VTs = DAG.getVTList(MVT::Other);
22607 SDValue Ops[] = { Chain, Val, Ptr, Mask };
22609 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
22610 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
22613 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
22614 SelectionDAG &DAG) {
22615 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
22620 case llvm::Intrinsic::x86_seh_ehregnode:
22621 return MarkEHRegistrationNode(Op, DAG);
22622 case llvm::Intrinsic::x86_seh_ehguard:
22623 return MarkEHGuard(Op, DAG);
22624 case llvm::Intrinsic::x86_flags_read_u32:
22625 case llvm::Intrinsic::x86_flags_read_u64:
22626 case llvm::Intrinsic::x86_flags_write_u32:
22627 case llvm::Intrinsic::x86_flags_write_u64: {
    // We need a frame pointer because this will get lowered to a PUSH/POP
    // sequence.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setHasCopyImplyingStackAdjustment(true);
    // Don't do anything here, we will expand these intrinsics out later
    // during ExpandISelPseudos in EmitInstrWithCustomInserter.
    return SDValue();
  }
22636 case Intrinsic::x86_lwpins32:
22637 case Intrinsic::x86_lwpins64:
22638 case Intrinsic::x86_umwait:
  case Intrinsic::x86_tpause: {
    SDLoc dl(Op);
    SDValue Chain = Op->getOperand(0);
    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);

    unsigned Opcode;
    switch (IntNo) {
    default: llvm_unreachable("Impossible intrinsic");
22647 case Intrinsic::x86_umwait:
22648 Opcode = X86ISD::UMWAIT;
22650 case Intrinsic::x86_tpause:
22651 Opcode = X86ISD::TPAUSE;
22653 case Intrinsic::x86_lwpins32:
22654 case Intrinsic::x86_lwpins64:
22655 Opcode = X86ISD::LWPINS;
22659 SDValue Operation =
22660 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
22661 Op->getOperand(3), Op->getOperand(4));
22662 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
22663 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
22664 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
22665 Operation.getValue(1));
    }
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
22673 default: llvm_unreachable("Unknown Intrinsic Type");
22676 // Emit the node with the right value type.
22677 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
22678 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
22680 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
22681 // Otherwise return the value from Rand, which is always 0, casted to i32.
22682 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
22683 DAG.getConstant(1, dl, Op->getValueType(1)),
22684 DAG.getConstant(X86::COND_B, dl, MVT::i8),
22685 SDValue(Result.getNode(), 1) };
22686 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
22688 // Return { result, isValid, chain }.
22689 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
22690 SDValue(Result.getNode(), 2));
22692 case GATHER_AVX2: {
22693 SDValue Chain = Op.getOperand(0);
22694 SDValue Src = Op.getOperand(2);
22695 SDValue Base = Op.getOperand(3);
22696 SDValue Index = Op.getOperand(4);
22697 SDValue Mask = Op.getOperand(5);
22698 SDValue Scale = Op.getOperand(6);
22699 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
22700 Scale, Chain, Subtarget);
22703 //gather(v1, mask, index, base, scale);
22704 SDValue Chain = Op.getOperand(0);
22705 SDValue Src = Op.getOperand(2);
22706 SDValue Base = Op.getOperand(3);
22707 SDValue Index = Op.getOperand(4);
22708 SDValue Mask = Op.getOperand(5);
22709 SDValue Scale = Op.getOperand(6);
22710 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
22714 //scatter(base, mask, index, v1, scale);
22715 SDValue Chain = Op.getOperand(0);
22716 SDValue Base = Op.getOperand(2);
22717 SDValue Mask = Op.getOperand(3);
22718 SDValue Index = Op.getOperand(4);
22719 SDValue Src = Op.getOperand(5);
22720 SDValue Scale = Op.getOperand(6);
22721 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
22722 Scale, Chain, Subtarget);
22725 SDValue Hint = Op.getOperand(6);
22726 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
22727 assert((HintVal == 2 || HintVal == 3) &&
22728 "Wrong prefetch hint in intrinsic: should be 2 or 3");
22729 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
22730 SDValue Chain = Op.getOperand(0);
22731 SDValue Mask = Op.getOperand(2);
22732 SDValue Index = Op.getOperand(3);
22733 SDValue Base = Op.getOperand(4);
22734 SDValue Scale = Op.getOperand(5);
22735 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
22738 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
22740 SmallVector<SDValue, 2> Results;
22741 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
22743 return DAG.getMergeValues(Results, dl);
22745 // Read Performance Monitoring Counters.
22747 SmallVector<SDValue, 2> Results;
22748 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
22749 return DAG.getMergeValues(Results, dl);
22751 // Get Extended Control Register.
22753 SmallVector<SDValue, 2> Results;
22754 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
22755 return DAG.getMergeValues(Results, dl);
22757 // XTEST intrinsics.
22759 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
22760 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
22762 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
22763 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
22764 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
22765 Ret, SDValue(InTrans.getNode(), 1));
22767 case TRUNCATE_TO_MEM_VI8:
22768 case TRUNCATE_TO_MEM_VI16:
22769 case TRUNCATE_TO_MEM_VI32: {
22770 SDValue Mask = Op.getOperand(4);
22771 SDValue DataToTruncate = Op.getOperand(3);
22772 SDValue Addr = Op.getOperand(2);
22773 SDValue Chain = Op.getOperand(0);
22775 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
22776 assert(MemIntr && "Expected MemIntrinsicSDNode!");
22778 EVT MemVT = MemIntr->getMemoryVT();
22780 uint16_t TruncationOp = IntrData->Opc0;
22781 switch (TruncationOp) {
22782 case X86ISD::VTRUNC: {
22783 if (isAllOnesConstant(Mask)) // return just a truncate store
22784 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
22785 MemIntr->getMemOperand());
22787 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
22788 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22790 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
22791 MemIntr->getMemOperand(), true /* truncating */);
22793 case X86ISD::VTRUNCUS:
22794 case X86ISD::VTRUNCS: {
22795 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
22796 if (isAllOnesConstant(Mask))
22797 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
22798 MemIntr->getMemOperand(), DAG);
22800 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
22801 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22803 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
22804 VMask, MemVT, MemIntr->getMemOperand(), DAG);
22807 llvm_unreachable("Unsupported truncstore intrinsic");
22813 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
22814 SelectionDAG &DAG) const {
22815 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
22816 MFI.setReturnAddressIsTaken(true);
  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();
22821 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
22826 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
22827 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
22828 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
22829 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
22830 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }
22834 // Just load the return address.
22835 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
22836 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
22837 MachinePointerInfo());
22840 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
22841 SelectionDAG &DAG) const {
22842 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
22843 return getReturnAddressFrameIndex(DAG);
22846 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
22847 MachineFunction &MF = DAG.getMachineFunction();
22848 MachineFrameInfo &MFI = MF.getFrameInfo();
22849 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
22850 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
22851 EVT VT = Op.getValueType();
22853 MFI.setFrameAddressIsTaken(true);
22855 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
22856 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
22857 // is not possible to crawl up the stack without looking at the unwind codes
22859 int FrameAddrIndex = FuncInfo->getFAIndex();
22860 if (!FrameAddrIndex) {
22861 // Set up a frame object for the return address.
22862 unsigned SlotSize = RegInfo->getSlotSize();
22863 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
22864 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
22865 FuncInfo->setFAIndex(FrameAddrIndex);
22867 return DAG.getFrameIndex(FrameAddrIndex, VT);
22870 unsigned FrameReg =
22871 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
22872 SDLoc dl(Op); // FIXME probably not meaningful
22873 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
22874 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
22875 (FrameReg == X86::EBP && VT == MVT::i32)) &&
22876 "Invalid Frame Register!");
22877 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}
22884 // FIXME? Maybe this could be a TableGen attribute on some registers and
22885 // this table could be generated automatically from RegInfo.
22886 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
22887 SelectionDAG &DAG) const {
22888 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
22889 const MachineFunction &MF = DAG.getMachineFunction();
22891 unsigned Reg = StringSwitch<unsigned>(RegName)
22892 .Case("esp", X86::ESP)
22893 .Case("rsp", X86::RSP)
22894 .Case("ebp", X86::EBP)
22895 .Case("rbp", X86::RBP)
22898 if (Reg == X86::EBP || Reg == X86::RBP) {
22899 if (!TFI.hasFP(MF))
22900 report_fatal_error("register " + StringRef(RegName) +
22901 " is allocatable: function has no frame pointer");
22904 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
22905 unsigned FrameReg =
22906 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
22907 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
22908 "Invalid Frame Register!");
22916 report_fatal_error("Invalid register name global variable");
22919 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
22920 SelectionDAG &DAG) const {
22921 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
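  // Incoming arguments start just above the saved frame pointer and the
  // return address, i.e. two slots above the frame pointer.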
22922 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
22925 unsigned X86TargetLowering::getExceptionPointerRegister(
22926 const Constant *PersonalityFn) const {
22927 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
22928 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
22930 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
22933 unsigned X86TargetLowering::getExceptionSelectorRegister(
22934 const Constant *PersonalityFn) const {
22935 // Funclet personalities don't use selectors (the runtime does the selection).
22936 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
22937 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
22940 bool X86TargetLowering::needsFixedCatchObjects() const {
22941 return Subtarget.isTargetWin64();
22944 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
22945 SDValue Chain = Op.getOperand(0);
22946 SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  SDLoc dl(Op);
22950 EVT PtrVT = getPointerTy(DAG.getDataLayout());
22951 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
22952 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
22953 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
22954 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
22955 "Invalid Frame Register!");
22956 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
22957 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
22959 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
22960 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
22962 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
22963 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
22964 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
22966 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
22967 DAG.getRegister(StoreAddrReg, PtrVT));
22968 }
22970 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
22971 SelectionDAG &DAG) const {
22972 SDLoc DL(Op);
22973 // If the subtarget is not 64bit, we may need the global base reg
22974 // after isel expand pseudo, i.e., after CGBR pass ran.
22975 // Therefore, ask for the GlobalBaseReg now, so that the pass
22976 // inserts the code for us in case we need it.
22977 // Otherwise, we will end up in a situation where we will
22978 // reference a virtual register that is not defined!
22979 if (!Subtarget.is64Bit()) {
22980 const X86InstrInfo *TII = Subtarget.getInstrInfo();
22981 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
22982 }
22983 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
22984 DAG.getVTList(MVT::i32, MVT::Other),
22985 Op.getOperand(0), Op.getOperand(1));
22986 }
22988 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
22989 SelectionDAG &DAG) const {
22990 SDLoc DL(Op);
22991 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
22992 Op.getOperand(0), Op.getOperand(1));
22993 }
22995 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
22996 SelectionDAG &DAG) const {
22997 SDLoc DL(Op);
22998 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
22999 Op.getOperand(0));
23000 }
23002 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
23003 return Op.getOperand(0);
23004 }
23006 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
23007 SelectionDAG &DAG) const {
23008 SDValue Root = Op.getOperand(0);
23009 SDValue Trmp = Op.getOperand(1); // trampoline
23010 SDValue FPtr = Op.getOperand(2); // nested function
23011 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
23012 SDLoc dl(Op);
23014 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
23015 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
23017 if (Subtarget.is64Bit()) {
23018 SDValue OutChains[6];
23020 // Large code-model.
23021 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
23022 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
23024 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
23025 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
23027 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
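// For reference, a sketch of the byte sequence the stores below assemble,
// assuming the encodings above (offsets are relative to Trmp; the i16 opcode
// stores are little-endian, so the REX byte lands first):
//   +0:  49 BB <FPtr, 8 bytes>   movabsq $FPtr, %r11
//   +10: 49 BA <Nest, 8 bytes>   movabsq $Nest, %r10
//   +20: 49 FF E3                jmpq   *%r11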
23029 // Load the pointer to the nested function into R11.
23030 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
23031 SDValue Addr = Trmp;
23032 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
23033 Addr, MachinePointerInfo(TrmpAddr));
23035 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
23036 DAG.getConstant(2, dl, MVT::i64));
23037 OutChains[1] =
23038 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
23039 /* Alignment = */ 2);
23041 // Load the 'nest' parameter value into R10.
23042 // R10 is specified in X86CallingConv.td
23043 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
23044 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
23045 DAG.getConstant(10, dl, MVT::i64));
23046 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
23047 Addr, MachinePointerInfo(TrmpAddr, 10));
23049 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
23050 DAG.getConstant(12, dl, MVT::i64));
23051 OutChains[3] =
23052 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
23053 /* Alignment = */ 2);
23055 // Jump to the nested function.
23056 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
23057 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
23058 DAG.getConstant(20, dl, MVT::i64));
23059 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
23060 Addr, MachinePointerInfo(TrmpAddr, 20));
23062 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
23063 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
23064 DAG.getConstant(22, dl, MVT::i64));
23065 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
23066 Addr, MachinePointerInfo(TrmpAddr, 22));
23068 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
23069 } else {
23070 const Function *Func =
23071 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
23072 CallingConv::ID CC = Func->getCallingConv();
23073 unsigned NestReg;
23075 switch (CC) {
23076 default:
23077 llvm_unreachable("Unsupported calling convention");
23078 case CallingConv::C:
23079 case CallingConv::X86_StdCall: {
23080 // Pass 'nest' parameter in ECX.
23081 // Must be kept in sync with X86CallingConv.td
23082 NestReg = X86::ECX;
23084 // Check that ECX wasn't needed by an 'inreg' parameter.
23085 FunctionType *FTy = Func->getFunctionType();
23086 const AttributeList &Attrs = Func->getAttributes();
23088 if (!Attrs.isEmpty() && !Func->isVarArg()) {
23089 unsigned InRegCount = 0;
23090 unsigned Idx = 1;
23092 for (FunctionType::param_iterator I = FTy->param_begin(),
23093 E = FTy->param_end(); I != E; ++I, ++Idx)
23094 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
23095 auto &DL = DAG.getDataLayout();
23096 // FIXME: should only count parameters that are lowered to integers.
23097 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
23098 }
23100 if (InRegCount > 2) {
23101 report_fatal_error("Nest register in use - reduce number of inreg"
23102 " parameters!");
23103 }
23104 }
23105 break;
23106 }
23107 case CallingConv::X86_FastCall:
23108 case CallingConv::X86_ThisCall:
23109 case CallingConv::Fast:
23110 // Pass 'nest' parameter in EAX.
23111 // Must be kept in sync with X86CallingConv.td
23112 NestReg = X86::EAX;
23113 break;
23114 }
23116 SDValue OutChains[4];
23117 SDValue Addr, Disp;
23119 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
23120 DAG.getConstant(10, dl, MVT::i32));
23121 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
23123 // This is storing the opcode for MOV32ri.
23124 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
23125 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
23126 OutChains[0] =
23127 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
23128 Trmp, MachinePointerInfo(TrmpAddr));
23130 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
23131 DAG.getConstant(1, dl, MVT::i32));
23132 OutChains[1] =
23133 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
23134 /* Alignment = */ 1);
23136 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
23137 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
23138 DAG.getConstant(5, dl, MVT::i32));
23139 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
23140 Addr, MachinePointerInfo(TrmpAddr, 5),
23141 /* Alignment = */ 1);
23143 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
23144 DAG.getConstant(6, dl, MVT::i32));
23145 OutChains[3] =
23146 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
23147 /* Alignment = */ 1);
23149 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
23150 }
23151 }
23153 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
23154 SelectionDAG &DAG) const {
23155 /*
23156 The rounding mode is in bits 11:10 of FPSR, and has the following
23157 settings:
23158 00 Round to nearest
23159 01 Round to -inf
23160 10 Round to +inf
23161 11 Round to 0
23163 FLT_ROUNDS, on the other hand, expects the following:
23164 -1 Undefined
23165 0 Round to 0
23166 1 Round to nearest
23167 2 Round to +inf
23168 3 Round to -inf
23170 To perform the conversion, we do:
23171 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
23172 */
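// Worked example of the conversion above: with rounding-control bits 01
// (round toward -inf), (FPSR & 0x800) >> 11 is 0 and (FPSR & 0x400) >> 9 is
// 2, so ((0 | 2) + 1) & 3 = 3, matching the FLT_ROUNDS encoding for -inf.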
23174 MachineFunction &MF = DAG.getMachineFunction();
23175 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
23176 unsigned StackAlignment = TFI.getStackAlignment();
23177 MVT VT = Op.getSimpleValueType();
23178 SDLoc DL(Op);
23180 // Save FP Control Word to stack slot
23181 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
23182 SDValue StackSlot =
23183 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
23185 MachineMemOperand *MMO =
23186 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
23187 MachineMemOperand::MOStore, 2, 2);
23189 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
23190 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
23191 DAG.getVTList(MVT::Other),
23192 Ops, MVT::i16, MMO);
23194 // Load FP Control Word from stack slot
23195 SDValue CWD =
23196 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
23198 // Transform as necessary
23199 SDValue CWD1 =
23200 DAG.getNode(ISD::SRL, DL, MVT::i16,
23201 DAG.getNode(ISD::AND, DL, MVT::i16,
23202 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
23203 DAG.getConstant(11, DL, MVT::i8));
23204 SDValue CWD2 =
23205 DAG.getNode(ISD::SRL, DL, MVT::i16,
23206 DAG.getNode(ISD::AND, DL, MVT::i16,
23207 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
23208 DAG.getConstant(9, DL, MVT::i8));
23210 SDValue RetVal =
23211 DAG.getNode(ISD::AND, DL, MVT::i16,
23212 DAG.getNode(ISD::ADD, DL, MVT::i16,
23213 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
23214 DAG.getConstant(1, DL, MVT::i16)),
23215 DAG.getConstant(3, DL, MVT::i16));
23217 return DAG.getNode((VT.getSizeInBits() < 16 ?
23218 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
23219 }
23221 // Split a unary integer op into 2 half sized ops.
23222 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
23223 MVT VT = Op.getSimpleValueType();
23224 unsigned NumElems = VT.getVectorNumElements();
23225 unsigned SizeInBits = VT.getSizeInBits();
23226 MVT EltVT = VT.getVectorElementType();
23227 SDValue Src = Op.getOperand(0);
23228 assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
23229 "Src and Op should have the same element type!");
23231 // Extract the Lo/Hi vectors
23232 SDLoc dl(Op);
23233 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
23234 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
23236 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
23237 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23238 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
23239 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
23240 }
23242 // Decompose 256-bit ops into smaller 128-bit ops.
23243 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
23244 assert(Op.getSimpleValueType().is256BitVector() &&
23245 Op.getSimpleValueType().isInteger() &&
23246 "Only handle AVX 256-bit vector integer operation");
23247 return LowerVectorIntUnary(Op, DAG);
23248 }
23250 // Decompose 512-bit ops into smaller 256-bit ops.
23251 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
23252 assert(Op.getSimpleValueType().is512BitVector() &&
23253 Op.getSimpleValueType().isInteger() &&
23254 "Only handle AVX 512-bit vector integer operation");
23255 return LowerVectorIntUnary(Op, DAG);
23256 }
23258 /// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
23260 // i8/i16 vector implemented using dword LZCNT vector instruction
23261 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
23262 // split the vector, perform the operation on its Lo and Hi parts and
23263 // concatenate the results.
23264 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
23265 const X86Subtarget &Subtarget) {
23266 assert(Op.getOpcode() == ISD::CTLZ);
23267 SDLoc dl(Op);
23268 MVT VT = Op.getSimpleValueType();
23269 MVT EltVT = VT.getVectorElementType();
23270 unsigned NumElems = VT.getVectorNumElements();
23272 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
23273 "Unsupported element type");
23275 // Split vector; its Lo and Hi parts will be handled in the next iteration.
23276 if (NumElems > 16 ||
23277 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
23278 return LowerVectorIntUnary(Op, DAG);
23280 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
23281 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
23282 "Unsupported value type for operation");
23284 // Use native supported vector instruction vplzcntd.
23285 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
23286 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
23287 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
23288 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
23290 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
23291 }
23293 // Lower CTLZ using a PSHUFB lookup table implementation.
23294 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
23295 const X86Subtarget &Subtarget,
23296 SelectionDAG &DAG) {
23297 MVT VT = Op.getSimpleValueType();
23298 int NumElts = VT.getVectorNumElements();
23299 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
23300 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
23302 // Per-nibble leading zero PSHUFB lookup table.
23303 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
23304 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
23305 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
23306 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
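// Worked example of the per-nibble lookup: for the byte 0x13 the hi nibble
// is 0x1, so the result is LUT[1] = 3; for 0x05 the hi nibble is zero, so the
// two lookups are added: LUT[0] + LUT[5] = 4 + 1 = 5 leading zeros.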
23308 SmallVector<SDValue, 64> LUTVec;
23309 for (int i = 0; i < NumBytes; ++i)
23310 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23311 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
23313 // Begin by bitcasting the input to byte vector, then split those bytes
23314 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
23315 // If the hi input nibble is zero then we add both results together, otherwise
23316 // we just take the hi result (by masking the lo result to zero before the
23317 // add).
23318 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
23319 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
23320 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
23321 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
23322 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
23323 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
23324 SDValue HiZ;
23325 if (CurrVT.is512BitVector()) {
23326 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
23327 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
23328 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
23329 } else {
23330 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
23331 }
23333 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
23334 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
23335 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
23336 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
23338 // Merge result back from vXi8 back to VT, working on the lo/hi halves
23339 // of the current vector width in the same way we did for the nibbles.
23340 // If the upper half of the input element is zero then add the halves'
23341 // leading zero counts together, otherwise just use the upper half's.
23342 // Double the width of the result until we are at target width.
23343 while (CurrVT != VT) {
23344 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
23345 int CurrNumElts = CurrVT.getVectorNumElements();
23346 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
23347 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
23348 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
23350 // Check if the upper half of the input element is zero.
23351 if (CurrVT.is512BitVector()) {
23352 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
23353 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
23354 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
23355 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
23356 } else {
23357 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
23358 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
23359 }
23360 HiZ = DAG.getBitcast(NextVT, HiZ);
23362 // Move the upper/lower halves to the lower bits as we'll be extending to
23363 // NextVT. Mask the lower result to zero if HiZ is true and add the results
23364 // together.
23365 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
23366 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
23367 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
23368 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
23369 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
23370 CurrVT = NextVT;
23371 }
23373 return Res;
23374 }
23376 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
23377 const X86Subtarget &Subtarget,
23378 SelectionDAG &DAG) {
23379 MVT VT = Op.getSimpleValueType();
23381 if (Subtarget.hasCDI() &&
23382 // vXi8 vectors need to be promoted to 512-bits for vXi32.
23383 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
23384 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
23386 // Decompose 256-bit ops into smaller 128-bit ops.
23387 if (VT.is256BitVector() && !Subtarget.hasInt256())
23388 return Lower256IntUnary(Op, DAG);
23390 // Decompose 512-bit ops into smaller 256-bit ops.
23391 if (VT.is512BitVector() && !Subtarget.hasBWI())
23392 return Lower512IntUnary(Op, DAG);
23394 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
23395 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
23396 }
23398 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
23399 SelectionDAG &DAG) {
23400 MVT VT = Op.getSimpleValueType();
23401 MVT OpVT = VT;
23402 unsigned NumBits = VT.getSizeInBits();
23403 SDLoc dl(Op);
23404 unsigned Opc = Op.getOpcode();
23406 if (VT.isVector())
23407 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
23409 Op = Op.getOperand(0);
23410 if (VT == MVT::i8) {
23411 // Zero extend to i32 since there is not an i8 bsr.
23412 OpVT = MVT::i32;
23413 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
23414 }
23416 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
23417 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
23418 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
23420 if (Opc == ISD::CTLZ) {
23421 // If src is zero (i.e. bsr sets ZF), returns NumBits.
23422 SDValue Ops[] = {
23423 Op,
23424 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
23425 DAG.getConstant(X86::COND_E, dl, MVT::i8),
23426 Op.getValue(1)
23427 };
23428 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
23429 }
23431 // Finally xor with NumBits-1.
23432 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
23433 DAG.getConstant(NumBits - 1, dl, OpVT));
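// BSR returns the index of the highest set bit, so for a non-zero input
// ctlz(x) = (NumBits - 1) - bsr(x); since NumBits - 1 is all ones and
// bsr(x) never exceeds it, the subtraction is equivalent to the XOR above.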
23435 if (VT == MVT::i8)
23436 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
23437 return Op;
23438 }
23440 static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
23441 SelectionDAG &DAG) {
23442 MVT VT = Op.getSimpleValueType();
23443 unsigned NumBits = VT.getScalarSizeInBits();
23444 SDValue N0 = Op.getOperand(0);
23445 SDLoc dl(Op);
23447 // Decompose 256-bit ops into smaller 128-bit ops.
23448 if (VT.is256BitVector() && !Subtarget.hasInt256())
23449 return Lower256IntUnary(Op, DAG);
23451 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
23452 "Only scalar CTTZ requires custom lowering");
23454 // Issue a bsf (scan bits forward) which also sets EFLAGS.
23455 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23456 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
23458 // If src is zero (i.e. bsf sets ZF), returns NumBits.
23459 SDValue Ops[] = {
23460 Op,
23461 DAG.getConstant(NumBits, dl, VT),
23462 DAG.getConstant(X86::COND_E, dl, MVT::i8),
23463 Op.getValue(1)
23464 };
23465 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
23466 }
23468 /// Break a 256-bit integer operation into two new 128-bit ones and then
23469 /// concatenate the result back.
23470 static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
23471 MVT VT = Op.getSimpleValueType();
23473 assert(VT.is256BitVector() && VT.isInteger() &&
23474 "Unsupported value type for operation");
23476 unsigned NumElems = VT.getVectorNumElements();
23478 SDLoc dl(Op);
23479 // Extract the LHS vectors
23480 SDValue LHS = Op.getOperand(0);
23481 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
23482 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
23484 // Extract the RHS vectors
23485 SDValue RHS = Op.getOperand(1);
23486 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
23487 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
23489 MVT EltVT = VT.getVectorElementType();
23490 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
23492 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23493 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
23494 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
23495 }
23497 /// Break a 512-bit integer operation into two new 256-bit ones and then
23498 /// concatenate the result back.
23499 static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
23500 MVT VT = Op.getSimpleValueType();
23502 assert(VT.is512BitVector() && VT.isInteger() &&
23503 "Unsupported value type for operation");
23505 unsigned NumElems = VT.getVectorNumElements();
23507 SDLoc dl(Op);
23508 // Extract the LHS vectors
23509 SDValue LHS = Op.getOperand(0);
23510 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
23511 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
23513 // Extract the RHS vectors
23514 SDValue RHS = Op.getOperand(1);
23515 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
23516 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
23518 MVT EltVT = VT.getVectorElementType();
23519 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
23521 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23522 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
23523 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
23524 }
23526 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
23527 const X86Subtarget &Subtarget) {
23528 MVT VT = Op.getSimpleValueType();
23529 if (VT == MVT::i16 || VT == MVT::i32)
23530 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23532 if (VT.getScalarType() == MVT::i1)
23533 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
23534 Op.getOperand(0), Op.getOperand(1));
23536 assert(Op.getSimpleValueType().is256BitVector() &&
23537 Op.getSimpleValueType().isInteger() &&
23538 "Only handle AVX 256-bit vector integer operation");
23539 return split256IntArith(Op, DAG);
23540 }
23542 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) {
23543 MVT VT = Op.getSimpleValueType();
23544 if (VT.getScalarType() == MVT::i1) {
23545 SDLoc dl(Op);
23546 switch (Op.getOpcode()) {
23547 default: llvm_unreachable("Expected saturated arithmetic opcode");
23548 case ISD::UADDSAT:
23549 case ISD::SADDSAT:
23550 return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1));
23551 case ISD::USUBSAT:
23552 case ISD::SSUBSAT:
23553 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
23554 DAG.getNOT(dl, Op.getOperand(1), VT));
23555 }
23556 }
23558 assert(Op.getSimpleValueType().is256BitVector() &&
23559 Op.getSimpleValueType().isInteger() &&
23560 "Only handle AVX 256-bit vector integer operation");
23561 return split256IntArith(Op, DAG);
23562 }
23564 static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
23565 SelectionDAG &DAG) {
23566 MVT VT = Op.getSimpleValueType();
23567 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
23568 // Since X86 does not have CMOV for 8-bit integer, we don't convert
23569 // 8-bit integer abs to NEG and CMOV.
23570 SDLoc DL(Op);
23571 SDValue N0 = Op.getOperand(0);
23572 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
23573 DAG.getConstant(0, DL, VT), N0);
23574 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
23575 SDValue(Neg.getNode(), 1)};
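// The CMOV selects its second operand when the condition holds, so this
// computes (0 - N0 >= 0) ? Neg : N0, i.e. the absolute value of N0, reusing
// the EFLAGS produced by the X86ISD::SUB above.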
23576 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
23577 }
23579 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
23580 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
23581 SDLoc DL(Op);
23582 SDValue Src = Op.getOperand(0);
23583 SDValue Sub =
23584 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
23585 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
23586 }
23588 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
23589 assert(VT.isInteger() &&
23590 "Only handle AVX 256-bit vector integer operation");
23591 return Lower256IntUnary(Op, DAG);
23592 }
23594 // Default to expand.
23595 return SDValue();
23596 }
23598 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
23599 MVT VT = Op.getSimpleValueType();
23601 // For AVX1 cases, split to use legal ops (everything but v4i64).
23602 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
23603 return split256IntArith(Op, DAG);
23605 SDLoc DL(Op);
23606 unsigned Opcode = Op.getOpcode();
23607 SDValue N0 = Op.getOperand(0);
23608 SDValue N1 = Op.getOperand(1);
23610 // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
23611 // using the SMIN/SMAX instructions and flipping the signbit back.
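// i.e. umin(x, y) == smin(x ^ 0x8000, y ^ 0x8000) ^ 0x8000 for i16 elements,
// since flipping the sign bit maps the unsigned order onto the signed order.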
23612 if (VT == MVT::v8i16) {
23613 assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
23614 "Unexpected MIN/MAX opcode");
23615 SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
23616 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
23617 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
23618 Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
23619 SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
23620 return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
23621 }
23623 // Else, expand to a compare/select.
23624 ISD::CondCode CC;
23625 switch (Opcode) {
23626 case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
23627 case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
23628 case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
23629 case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
23630 default: llvm_unreachable("Unknown MINMAX opcode");
23631 }
23633 SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
23634 return DAG.getSelect(DL, VT, Cond, N0, N1);
23635 }
23637 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
23638 SelectionDAG &DAG) {
23639 SDLoc dl(Op);
23640 MVT VT = Op.getSimpleValueType();
23642 if (VT.getScalarType() == MVT::i1)
23643 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
23645 // Decompose 256-bit ops into 128-bit ops.
23646 if (VT.is256BitVector() && !Subtarget.hasInt256())
23647 return split256IntArith(Op, DAG);
23649 SDValue A = Op.getOperand(0);
23650 SDValue B = Op.getOperand(1);
23652 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
23653 // vector pairs, multiply and truncate.
23654 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
23655 unsigned NumElts = VT.getVectorNumElements();
23657 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
23658 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
23659 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
23660 return DAG.getNode(
23661 ISD::TRUNCATE, dl, VT,
23662 DAG.getNode(ISD::MUL, dl, ExVT,
23663 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
23664 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
23665 }
23667 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
23669 // Extract the lo/hi parts to any extend to i16.
23670 // We're going to mask off the low byte of each result element of the
23671 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
23672 // element.
23673 SDValue Undef = DAG.getUNDEF(VT);
23674 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
23675 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
23677 SDValue BLo, BHi;
23678 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
23679 // If the LHS is a constant, manually unpackl/unpackh.
23680 SmallVector<SDValue, 16> LoOps, HiOps;
23681 for (unsigned i = 0; i != NumElts; i += 16) {
23682 for (unsigned j = 0; j != 8; ++j) {
23683 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
23684 MVT::i16));
23685 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
23686 MVT::i16));
23687 }
23688 }
23690 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
23691 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
23692 } else {
23693 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
23694 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
23695 }
23697 // Multiply, mask the lower 8bits of the lo/hi results and pack.
23698 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
23699 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
23700 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
23701 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
23702 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23703 }
23705 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
23706 if (VT == MVT::v4i32) {
23707 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
23708 "Should not custom lower when pmulld is available!");
23710 // Extract the odd parts.
23711 static const int UnpackMask[] = { 1, -1, 3, -1 };
23712 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
23713 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
23715 // Multiply the even parts.
23716 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
23717 DAG.getBitcast(MVT::v2i64, A),
23718 DAG.getBitcast(MVT::v2i64, B));
23719 // Now multiply odd parts.
23720 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
23721 DAG.getBitcast(MVT::v2i64, Aodds),
23722 DAG.getBitcast(MVT::v2i64, Bodds));
23724 Evens = DAG.getBitcast(VT, Evens);
23725 Odds = DAG.getBitcast(VT, Odds);
23727 // Merge the two vectors back together with a shuffle. This expands into 2
23728 // shuffles.
23729 static const int ShufMask[] = { 0, 4, 2, 6 };
23730 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
23731 }
23733 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
23734 "Only know how to lower V2I64/V4I64/V8I64 multiply");
23735 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
23737 // Ahi = psrlqi(a, 32);
23738 // Bhi = psrlqi(b, 32);
23740 // AloBlo = pmuludq(a, b);
23741 // AloBhi = pmuludq(a, Bhi);
23742 // AhiBlo = pmuludq(Ahi, b);
23744 // Hi = psllqi(AloBhi + AhiBlo, 32);
23745 // return AloBlo + Hi;
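// This follows from (Alo + 2^32*Ahi) * (Blo + 2^32*Bhi) mod 2^64
//   = AloBlo + 2^32*(AloBhi + AhiBlo),
// since the Ahi*Bhi term is shifted entirely out of the low 64 bits.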
23746 KnownBits AKnown = DAG.computeKnownBits(A);
23747 KnownBits BKnown = DAG.computeKnownBits(B);
23749 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
23750 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
23751 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
23753 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
23754 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
23755 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
23757 SDValue Zero = DAG.getConstant(0, dl, VT);
23759 // Only multiply lo/hi halves that aren't known to be zero.
23760 SDValue AloBlo = Zero;
23761 if (!ALoIsZero && !BLoIsZero)
23762 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
23764 SDValue AloBhi = Zero;
23765 if (!ALoIsZero && !BHiIsZero) {
23766 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
23767 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
23768 }
23770 SDValue AhiBlo = Zero;
23771 if (!AHiIsZero && !BLoIsZero) {
23772 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
23773 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
23774 }
23776 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
23777 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
23779 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
23780 }
23782 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
23783 SelectionDAG &DAG) {
23784 SDLoc dl(Op);
23785 MVT VT = Op.getSimpleValueType();
23786 bool IsSigned = Op->getOpcode() == ISD::MULHS;
23787 unsigned NumElts = VT.getVectorNumElements();
23788 SDValue A = Op.getOperand(0);
23789 SDValue B = Op.getOperand(1);
23791 // Decompose 256-bit ops into 128-bit ops.
23792 if (VT.is256BitVector() && !Subtarget.hasInt256())
23793 return split256IntArith(Op, DAG);
23795 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
23796 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
23797 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
23798 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
23800 // PMULxD operations multiply each even value (starting at 0) of LHS with
23801 // the related value of RHS and produce a widened result.
23802 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
23803 // => <2 x i64> <ae|cg>
23805 // In other words, to have all the results, we need to perform two PMULxD:
23806 // 1. one with the even values.
23807 // 2. one with the odd values.
23808 // To achieve #2, we need to place the odd values at an even position.
23810 // Place the odd value at an even position (basically, shift all values 1
23811 // step to the left):
23812 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
23813 9, -1, 11, -1, 13, -1, 15, -1};
23814 // <a|b|c|d> => <b|undef|d|undef>
23815 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
23816 makeArrayRef(&Mask[0], NumElts));
23817 // <e|f|g|h> => <f|undef|h|undef>
23818 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
23819 makeArrayRef(&Mask[0], NumElts));
23821 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
23822 // ints.
23823 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
23824 unsigned Opcode =
23825 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
23826 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
23827 // => <2 x i64> <ae|cg>
23828 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
23829 DAG.getBitcast(MulVT, A),
23830 DAG.getBitcast(MulVT, B)));
23831 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
23832 // => <2 x i64> <bf|dh>
23833 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
23834 DAG.getBitcast(MulVT, Odd0),
23835 DAG.getBitcast(MulVT, Odd1)));
23837 // Shuffle it back into the right order.
23838 SmallVector<int, 16> ShufMask(NumElts);
23839 for (int i = 0; i != (int)NumElts; ++i)
23840 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
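// For v4i32 this produces the mask {1, 5, 3, 7}: it gathers the high 32-bit
// halves of the even-lane products (Mul1) and odd-lane products (Mul2) back
// into the original lane order, which is exactly the MULHS/MULHU result.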
23842 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
23844 // If we have a signed multiply but no PMULDQ fix up the result of an
23845 // unsigned multiply.
23846 if (IsSigned && !Subtarget.hasSSE41()) {
23847 SDValue Zero = DAG.getConstant(0, dl, VT);
23848 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
23849 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
23850 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
23851 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
23853 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
23854 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
23855 }
23857 return Res;
23858 }
23860 // Only i8 vectors should need custom lowering after this.
23861 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
23862 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
23863 "Unsupported vector type");
23865 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
23866 // logical shift down the upper half and pack back to i8.
23868 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
23869 // and then ashr/lshr the upper bits down to the lower bits before multiply.
23870 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23872 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
23873 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
23874 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
23875 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
23876 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
23877 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
23878 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
23879 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
23880 }
23882 // For signed 512-bit vectors, split into 256-bit vectors to allow the
23883 // sign-extension to occur.
23884 if (VT == MVT::v64i8 && IsSigned)
23885 return split512IntArith(Op, DAG);
23887 // Signed AVX2 implementation - extend xmm subvectors to ymm.
23888 if (VT == MVT::v32i8 && IsSigned) {
23889 SDValue Lo = DAG.getIntPtrConstant(0, dl);
23890 SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl);
23892 MVT ExVT = MVT::v16i16;
23893 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
23894 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
23895 SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
23896 SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
23897 ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
23898 BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
23899 AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
23900 BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
23901 Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
23902 Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
23903 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
23904 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
23906 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
23907 // Shuffle lowering should turn this into PACKUS+PERMQ
23908 Lo = DAG.getBitcast(VT, Lo);
23909 Hi = DAG.getBitcast(VT, Hi);
23910 return DAG.getVectorShuffle(VT, dl, Lo, Hi,
23911 { 0, 2, 4, 6, 8, 10, 12, 14,
23912 16, 18, 20, 22, 24, 26, 28, 30,
23913 32, 34, 36, 38, 40, 42, 44, 46,
23914 48, 50, 52, 54, 56, 58, 60, 62});
23915 }
23917 // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
23918 // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
23919 // shift the results and pack the half lane results back together.
23921 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
23923 static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
23924 -1, -1, -1, -1, -1, -1, -1, -1};
23926 // Extract the lo parts and zero/sign extend to i16.
23927 // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
23928 // shifts to sign extend. Using unpack for unsigned only requires an xor to
23929 // create zeros and a copy due to tied registers constraints pre-avx. But using
23930 // zero_extend_vector_inreg would require an additional pshufd for the high
23931 // part.
23933 SDValue ALo, AHi;
23934 if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
23935 ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
23937 AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
23938 AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
23939 } else if (IsSigned) {
23940 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
23941 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
23943 ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
23944 AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
23945 } else {
23946 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
23947 DAG.getConstant(0, dl, VT)));
23948 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
23949 DAG.getConstant(0, dl, VT)));
23950 }
23952 SDValue BLo, BHi;
23953 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
23954 // If the LHS is a constant, manually unpackl/unpackh and extend.
23955 SmallVector<SDValue, 16> LoOps, HiOps;
23956 for (unsigned i = 0; i != NumElts; i += 16) {
23957 for (unsigned j = 0; j != 8; ++j) {
23958 SDValue LoOp = B.getOperand(i + j);
23959 SDValue HiOp = B.getOperand(i + j + 8);
23961 if (IsSigned) {
23962 LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
23963 HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
23964 } else {
23965 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
23966 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
23967 }
23969 LoOps.push_back(LoOp);
23970 HiOps.push_back(HiOp);
23971 }
23972 }
23974 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
23975 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
23976 } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
23977 BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
23979 BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
23980 BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
23981 } else if (IsSigned) {
23982 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
23983 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
23985 BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
23986 BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
23987 } else {
23988 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
23989 DAG.getConstant(0, dl, VT)));
23990 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
23991 DAG.getConstant(0, dl, VT)));
23992 }
23994 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
23995 // pack back to vXi8.
23996 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
23997 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
23998 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
23999 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
24001 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
24002 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
24003 }
24005 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
24006 assert(Subtarget.isTargetWin64() && "Unexpected target");
24007 EVT VT = Op.getValueType();
24008 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
24009 "Unexpected return type for lowering");
24011 RTLIB::Libcall LC;
24012 bool isSigned;
24013 switch (Op->getOpcode()) {
24014 default: llvm_unreachable("Unexpected request for libcall!");
24015 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
24016 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
24017 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
24018 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
24019 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
24020 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
24021 }
24023 SDLoc dl(Op);
24024 SDValue InChain = DAG.getEntryNode();
24026 TargetLowering::ArgListTy Args;
24027 TargetLowering::ArgListEntry Entry;
24028 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
24029 EVT ArgVT = Op->getOperand(i).getValueType();
24030 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
24031 "Unexpected argument type for lowering");
24032 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
24033 Entry.Node = StackPtr;
24034 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
24035 MachinePointerInfo(), /* Alignment = */ 16);
24036 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24037 Entry.Ty = PointerType::get(ArgTy,0);
24038 Entry.IsSExt = false;
24039 Entry.IsZExt = false;
24040 Args.push_back(Entry);
24041 }
24043 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
24044 getPointerTy(DAG.getDataLayout()));
24046 TargetLowering::CallLoweringInfo CLI(DAG);
24047 CLI.setDebugLoc(dl)
24048 .setChain(InChain)
24049 .setLibCallee(
24050 getLibcallCallingConv(LC),
24051 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
24052 std::move(Args))
24053 .setInRegister()
24054 .setSExtResult(isSigned)
24055 .setZExtResult(!isSigned);
24057 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
24058 return DAG.getBitcast(VT, CallInfo.first);
24059 }
24061 // Return true if the required (according to Opcode) shift-imm form is natively
24062 // supported by the Subtarget
24063 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
24064 unsigned Opcode) {
24065 if (VT.getScalarSizeInBits() < 16)
24066 return false;
24068 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
24069 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
24070 return true;
24072 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
24073 (VT.is256BitVector() && Subtarget.hasInt256());
24075 bool AShift = LShift && (Subtarget.hasAVX512() ||
24076 (VT != MVT::v2i64 && VT != MVT::v4i64));
24077 return (Opcode == ISD::SRA) ? AShift : LShift;
24078 }
24080 // The shift amount is a variable, but it is the same for all vector lanes.
24081 // These instructions are defined together with shift-immediate.
24082 static
24083 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
24084 unsigned Opcode) {
24085 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
24086 }
24088 // Return true if the required (according to Opcode) variable-shift form is
24089 // natively supported by the Subtarget
24090 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
24091 unsigned Opcode) {
24093 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
24094 return false;
24096 // vXi16 supported only on AVX-512, BWI
24097 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
24098 return false;
24100 if (Subtarget.hasAVX512())
24101 return true;
24103 bool LShift = VT.is128BitVector() || VT.is256BitVector();
24104 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
24105 return (Opcode == ISD::SRA) ? AShift : LShift;
24106 }
24108 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
24109 const X86Subtarget &Subtarget) {
24110 MVT VT = Op.getSimpleValueType();
24111 SDLoc dl(Op);
24112 SDValue R = Op.getOperand(0);
24113 SDValue Amt = Op.getOperand(1);
24114 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
24116 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
24117 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
24118 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
24119 SDValue Ex = DAG.getBitcast(ExVT, R);
24121 // ashr(R, 63) === cmp_slt(R, 0)
24122 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
24123 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
24124 "Unsupported PCMPGT op");
24125 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
24126 }
24128 if (ShiftAmt >= 32) {
24129 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
24130 SDValue Upper =
24131 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
24132 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
24133 ShiftAmt - 32, DAG);
24134 if (VT == MVT::v2i64)
24135 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
24136 if (VT == MVT::v4i64)
24137 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
24138 {9, 1, 11, 3, 13, 5, 15, 7});
24139 } else {
24140 // SRA upper i32, SRL whole i64 and select lower i32.
24141 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
24142 ShiftAmt, DAG);
24143 SDValue Lower =
24144 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
24145 Lower = DAG.getBitcast(ExVT, Lower);
24146 if (VT == MVT::v2i64)
24147 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
24148 if (VT == MVT::v4i64)
24149 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
24150 {8, 1, 10, 3, 12, 5, 14, 7});
24151 }
24152 return DAG.getBitcast(VT, Ex);
24153 };
24155 // Optimize shl/srl/sra with constant shift amount.
24156 APInt APIntShiftAmt;
24157 if (!isConstantSplat(Amt, APIntShiftAmt))
24158 return SDValue();
24159 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
24161 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
24162 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
24164 // i64 SRA needs to be performed as partial shifts.
24165 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
24166 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
24167 Op.getOpcode() == ISD::SRA)
24168 return ArithmeticShiftRight64(ShiftAmt);
24170 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
24171 VT == MVT::v64i8) {
24172 unsigned NumElts = VT.getVectorNumElements();
24173 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
24175 // Simple i8 add case
24176 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
24177 return DAG.getNode(ISD::ADD, dl, VT, R, R);
24179 // ashr(R, 7) === cmp_slt(R, 0)
24180 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
24181 SDValue Zeros = DAG.getConstant(0, dl, VT);
24182 if (VT.is512BitVector()) {
24183 assert(VT == MVT::v64i8 && "Unexpected element type!");
24184 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
24185 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
24186 }
24187 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
24188 }
24190 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
24191 if (VT == MVT::v16i8 && Subtarget.hasXOP())
24192 return SDValue();
24194 if (Op.getOpcode() == ISD::SHL) {
24195 // Make a large shift.
24196 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
24197 ShiftAmt, DAG);
24198 SHL = DAG.getBitcast(VT, SHL);
24199 // Zero out the rightmost bits.
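// e.g. for a byte shift amount of 3 the v8i16 shift lets bits spill across
// byte boundaries, and the AND below with 0xF8 (-1 << 3 truncated to i8)
// clears the bits that leaked in from the neighbouring byte.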
24200 return DAG.getNode(ISD::AND, dl, VT, SHL,
24201 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
24202 }
24203 if (Op.getOpcode() == ISD::SRL) {
24204 // Make a large shift.
24205 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
24206 ShiftAmt, DAG);
24207 SRL = DAG.getBitcast(VT, SRL);
24208 // Zero out the leftmost bits.
24209 return DAG.getNode(ISD::AND, dl, VT, SRL,
24210 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
24211 }
24212 if (Op.getOpcode() == ISD::SRA) {
24213 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
24214 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
24216 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
24217 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
24218 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
24219 return Res;
24220 }
24221 llvm_unreachable("Unknown shift opcode.");
24222 }
24224 return SDValue();
24225 }
24227 // If V is a splat value, return the source vector and splat index;
24228 static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) {
24229 V = peekThroughEXTRACT_SUBVECTORs(V);
24231 EVT VT = V.getValueType();
24232 unsigned Opcode = V.getOpcode();
24233 switch (Opcode) {
24234 default: {
24235 APInt UndefElts;
24236 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
24237 if (DAG.isSplatValue(V, DemandedElts, UndefElts)) {
24238 // Handle case where all demanded elements are UNDEF.
24239 if (DemandedElts.isSubsetOf(UndefElts)) {
24240 SplatIdx = 0;
24241 return DAG.getUNDEF(VT);
24242 }
24243 SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
24244 return V;
24245 }
24246 break;
24247 }
24248 case ISD::VECTOR_SHUFFLE: {
24249 // Check if this is a shuffle node doing a splat.
24250 // TODO - remove this and rely purely on SelectionDAG::isSplatValue,
24251 // getTargetVShiftNode currently struggles without the splat source.
24252 auto *SVN = cast<ShuffleVectorSDNode>(V);
24253 if (!SVN->isSplat())
24254 break;
24255 int Idx = SVN->getSplatIndex();
24256 int NumElts = V.getValueType().getVectorNumElements();
24257 SplatIdx = Idx % NumElts;
24258 return V.getOperand(Idx / NumElts);
24259 }
24260 }
24262 return SDValue();
24263 }
24265 static SDValue GetSplatValue(SDValue V, const SDLoc &dl,
24266 SelectionDAG &DAG) {
24267 int SplatIdx;
24268 if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG))
24269 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
24270 SrcVector.getValueType().getScalarType(), SrcVector,
24271 DAG.getIntPtrConstant(SplatIdx, dl));
24273 return SDValue();
24274 }
24275 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
24276 const X86Subtarget &Subtarget) {
24277 MVT VT = Op.getSimpleValueType();
24278 SDLoc dl(Op);
24279 SDValue R = Op.getOperand(0);
24280 SDValue Amt = Op.getOperand(1);
24281 unsigned Opcode = Op.getOpcode();
24282 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
24283 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
24285 if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) {
24286 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
24287 MVT EltVT = VT.getVectorElementType();
24288 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
24289 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
24290 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
24291 else if (EltVT.bitsLT(MVT::i32))
24292 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
24294 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
24295 }
24297 // vXi8 shifts - shift as v8i16 + mask result.
24298 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
24299 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
24300 VT == MVT::v64i8) &&
24301 !Subtarget.hasXOP()) {
24302 unsigned NumElts = VT.getVectorNumElements();
24303 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
24304 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
24305 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
24306 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
24307 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
24309 // Create the mask using vXi16 shifts. For shift-rights we need to move
24310 // the upper byte down before splatting the vXi8 mask.
24311 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
24312 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
24313 BaseShAmt, Subtarget, DAG);
24314 if (Opcode != ISD::SHL)
24315 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
24316 8, DAG);
24317 BitMask = DAG.getBitcast(VT, BitMask);
24318 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
24319 SmallVector<int, 64>(NumElts, 0));
24321 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
24322 DAG.getBitcast(ExtVT, R), BaseShAmt,
24323 Subtarget, DAG);
24324 Res = DAG.getBitcast(VT, Res);
24325 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
24327 if (Opcode == ISD::SRA) {
24328 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
24329 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
24330 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
24331 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
24332 BaseShAmt, Subtarget, DAG);
24333 SignMask = DAG.getBitcast(VT, SignMask);
24334 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
24335 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
24336 }
24337 return Res;
24338 }
24339 }
24340 }
24342 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
24343 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
24344 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
24345 Amt = Amt.getOperand(0);
24346 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
24347 std::vector<SDValue> Vals(Ratio);
24348 for (unsigned i = 0; i != Ratio; ++i)
24349 Vals[i] = Amt.getOperand(i);
24350 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
24351 for (unsigned j = 0; j != Ratio; ++j)
24352 if (Vals[j] != Amt.getOperand(i + j))
24353 return SDValue();
24354 }
24356 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
24357 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
24358 }
24359 return SDValue();
24360 }
24362 // Convert a shift/rotate left amount to a multiplication scale factor.
24363 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
24364 const X86Subtarget &Subtarget,
24365 SelectionDAG &DAG) {
24366 MVT VT = Amt.getSimpleValueType();
24367 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
24368 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
24369 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
24370 return SDValue();
24372 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
24373 SmallVector<SDValue, 8> Elts;
24374 MVT SVT = VT.getVectorElementType();
24375 unsigned SVTBits = SVT.getSizeInBits();
24376 APInt One(SVTBits, 1);
24377 unsigned NumElems = VT.getVectorNumElements();
24379 for (unsigned i = 0; i != NumElems; ++i) {
24380 SDValue Op = Amt->getOperand(i);
24381 if (Op->isUndef()) {
24382 Elts.push_back(Op);
24383 continue;
24384 }
24386 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
24387 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
24388 uint64_t ShAmt = C.getZExtValue();
24389 if (ShAmt >= SVTBits) {
24390 Elts.push_back(DAG.getUNDEF(SVT));
24391 continue;
24392 }
24393 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
24394 }
24395 return DAG.getBuildVector(VT, dl, Elts);
24396 }
24398 // If the target doesn't support variable shifts, use either FP conversion
24399 // or integer multiplication to avoid shifting each element individually.
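// The v4i32 case below builds the scale 2^Amt in floating point: adding
// (Amt << 23) to 0x3f800000 (the bit pattern of 1.0f) places Amt in the
// exponent field, and the FP_TO_SINT converts that power of two back to an
// integer multiplier.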
24400 if (VT == MVT::v4i32) {
24401 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
24402 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
24403 DAG.getConstant(0x3f800000U, dl, VT));
24404 Amt = DAG.getBitcast(MVT::v4f32, Amt);
24405 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
24406 }
24408 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
24409 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
24410 SDValue Z = DAG.getConstant(0, dl, VT);
24411 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
24412 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
24413 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
24414 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
24415 if (Subtarget.hasSSE41())
24416 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
24418 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
24419 DAG.getBitcast(VT, Hi),
24420 {0, 2, 4, 6, 8, 10, 12, 14});
24421 }
24423 return SDValue();
24424 }
24426 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
24427 SelectionDAG &DAG) {
24428 MVT VT = Op.getSimpleValueType();
24429 SDLoc dl(Op);
24430 SDValue R = Op.getOperand(0);
24431 SDValue Amt = Op.getOperand(1);
24432 unsigned EltSizeInBits = VT.getScalarSizeInBits();
24433 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
24435 unsigned Opc = Op.getOpcode();
24436 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
24437 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
24439 assert(VT.isVector() && "Custom lowering only for vector shifts!");
24440 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
24442 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
24443 return V;
24445 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
24446 return V;
24448 if (SupportedVectorVarShift(VT, Subtarget, Opc))
24449 return Op;
24451 // XOP has 128-bit variable logical/arithmetic shifts.
24452 // +ve/-ve Amt = shift left/right.
24453 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
24454 VT == MVT::v8i16 || VT == MVT::v16i8)) {
24455 if (Opc == ISD::SRL || Opc == ISD::SRA) {
24456 SDValue Zero = DAG.getConstant(0, dl, VT);
24457 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
24458 }
24459 if (Opc == ISD::SHL || Opc == ISD::SRL)
24460 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
24461 if (Opc == ISD::SRA)
24462 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
24463 }
24465 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
24466 // shifts per-lane and then shuffle the partial results back together.
24467 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
24468 // Splat the shift amounts so the scalar shifts above will catch it.
24469 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
24470 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
24471 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
24472 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
24473 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
24474 }
24476 // i64 vector arithmetic shift can be emulated with the transform:
24477 // M = lshr(SIGN_MASK, Amt)
24478 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
24479 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
24480 Opc == ISD::SRA) {
24481 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
24482 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
24483 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
24484 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
24485 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
24486 return R;
24487 }
24489 // If possible, lower this shift as a sequence of two shifts by
24490 // constant plus a BLENDing shuffle instead of scalarizing it.
24492 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
24494 // Could be rewritten as:
24495 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
24497 // The advantage is that the two shifts from the example would be
24498 // lowered as X86ISD::VSRLI nodes in parallel before blending.
24499 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
24500 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
24501 SDValue Amt1, Amt2;
24502 unsigned NumElts = VT.getVectorNumElements();
24503 SmallVector<int, 8> ShuffleMask;
24504 for (unsigned i = 0; i != NumElts; ++i) {
24505 SDValue A = Amt->getOperand(i);
24507 ShuffleMask.push_back(SM_SentinelUndef);
24510 if (!Amt1 || Amt1 == A) {
24511 ShuffleMask.push_back(i);
24515 if (!Amt2 || Amt2 == A) {
24516 ShuffleMask.push_back(i + NumElts);
24523 // Only perform this blend if we can perform it without loading a mask.
24524 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
24525 (VT != MVT::v16i16 ||
24526 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
24527 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
24528 canWidenShuffleElements(ShuffleMask))) {
24529 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
24530 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
24531 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
24532 Cst2->getAPIntValue().ult(EltSizeInBits)) {
24533 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
24534 Cst1->getZExtValue(), DAG);
24535 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
24536 Cst2->getZExtValue(), DAG);
24537 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
24542 // If possible, lower this packed shift into a vector multiply instead of
24543 // expanding it into a sequence of scalar shifts.
24544 if (Opc == ISD::SHL)
24545 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
24546 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
24548 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
24549 // can replace with ISD::MULHU, creating a scale factor from (NumEltBits - Amt).
24550 if (Opc == ISD::SRL && ConstantAmt &&
24551 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
24552 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
24553 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
24554 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
24555 SDValue Zero = DAG.getConstant(0, dl, VT);
24556 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
24557 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
24558 return DAG.getSelect(dl, VT, ZAmt, R, Res);
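// This works because srl(x, c) == mulhu(x, 1 << (16 - c)) for 0 < c < 16;
// the SETEQ select handles c == 0, where the scale would wrap to 0.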
24562 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
24563 // can replace with ISD::MULHS, creating a scale factor from (NumEltBits - Amt).
24564 // TODO: Special case handling for shift by 0/1, really we can afford either
24565 // of these cases in pre-SSE41/XOP/AVX512 but not both.
24566 if (Opc == ISD::SRA && ConstantAmt &&
24567 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
24568 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
24569 !Subtarget.hasAVX512()) ||
24570 DAG.isKnownNeverZero(Amt))) {
24571 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
24572 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
24573 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
24575 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
24577 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
24579 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
24580 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
24581 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
24582 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
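// Here sra(x, c) == mulhs(x, 1 << (16 - c)) for 2 <= c <= 15; amounts of 0
// and 1 are handled by the selects above, since 1 << 16 and 1 << 15 don't
// behave as positive i16 scale factors.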
24586 // v4i32 Non Uniform Shifts.
24587 // If the shift amount is constant we can shift each lane using the SSE2
24588 // immediate shifts, else we need to zero-extend each lane to the lower i64
24589 // and shift using the SSE2 variable shifts.
24590 // The separate results can then be blended together.
24591 if (VT == MVT::v4i32) {
24592 SDValue Amt0, Amt1, Amt2, Amt3;
24594 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
24595 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
24596 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
24597 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
24599 // The SSE2 shifts use the lower i64 as the same shift amount for
24600 // all lanes and the upper i64 is ignored. On AVX we're better off
24601 // just zero-extending, but for SSE just duplicating the top 16-bits is
24602 // cheaper and has the same effect for out of range values.
24603 if (Subtarget.hasAVX()) {
24604 SDValue Z = DAG.getConstant(0, dl, VT);
24605 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
24606 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
24607 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
24608 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
24610 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
24611 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
24612 {4, 5, 6, 7, -1, -1, -1, -1});
24613 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
24614 {0, 1, 1, 1, -1, -1, -1, -1});
24615 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
24616 {2, 3, 3, 3, -1, -1, -1, -1});
24617 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
24618 {0, 1, 1, 1, -1, -1, -1, -1});
24619 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
24620 {2, 3, 3, 3, -1, -1, -1, -1});
24624 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
24625 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
24626 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
24627 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
24628 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
24630 // Merge the shifted lane results optimally with/without PBLENDW.
24631 // TODO - ideally shuffle combining would handle this.
24632 if (Subtarget.hasSSE41()) {
24633 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
24634 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
24635 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
24637 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
24638 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
24639 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
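// Each Ri only holds the correct result in lane i; the shuffles above gather
// lane i from Ri to form the final vector.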
24642 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
24643 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
24644 // make the existing SSE solution better.
24645 // NOTE: We honor preferred vector width before promoting to 512-bits.
24646 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
24647 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
24648 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
24649 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
24650 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
24651 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
24652 "Unexpected vector type");
24653 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
24654 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
24655 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24656 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
24657 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
24658 return DAG.getNode(ISD::TRUNCATE, dl, VT,
24659 DAG.getNode(Opc, dl, ExtVT, R, Amt));
24662 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
24663 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
24664 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
24665 (VT == MVT::v16i8 || VT == MVT::v64i8 ||
24666 (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
24667 !Subtarget.hasXOP()) {
24668 int NumElts = VT.getVectorNumElements();
24669 SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8);
24671 // Extend constant shift amount to vXi16 (it doesn't matter if the type isn't legal).
24673 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
24674 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
24675 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
24676 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
24677 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
24678 "Constant build vector expected");
24680 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
24681 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
24682 : DAG.getZExtOrTrunc(R, dl, ExVT);
24683 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
24684 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
24685 return DAG.getZExtOrTrunc(R, dl, VT);
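// e.g. for srl x, 3 each zero-extended i16 element is multiplied by
// 1 << (8 - 3); bits [15:8] of the product hold x >> 3, which the VSRLI by 8
// moves back to the low byte before truncating.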
24688 SmallVector<SDValue, 16> LoAmt, HiAmt;
24689 for (int i = 0; i != NumElts; i += 16) {
24690 for (int j = 0; j != 8; ++j) {
24691 LoAmt.push_back(Amt.getOperand(i + j));
24692 HiAmt.push_back(Amt.getOperand(i + j + 8));
24696 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
24697 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
24698 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
24700 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
24701 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
24702 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
24703 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
24704 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
24705 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
24706 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
24707 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
24708 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
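// Unpacking R with itself duplicates each byte into both halves of an i16
// lane, so the first shift right by 8 produces a zero- or sign-extended i16.
// The MUL applies the (1 << (8 - Amt)) scale, and the final VSRLI/PACKUS
// move the results back into bytes.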
24711 if (VT == MVT::v16i8 ||
24712 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
24713 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
24714 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
24716 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
24717 if (VT.is512BitVector()) {
24718 // On AVX512BW targets we make use of the fact that VSELECT lowers
24719 // to a masked blend which selects bytes based just on the sign bit
24720 // extracted to a mask.
24721 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
24722 V0 = DAG.getBitcast(VT, V0);
24723 V1 = DAG.getBitcast(VT, V1);
24724 Sel = DAG.getBitcast(VT, Sel);
24725 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
24727 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
24728 } else if (Subtarget.hasSSE41()) {
24729 // On SSE41 targets we make use of the fact that VSELECT lowers
24730 // to PBLENDVB which selects bytes based just on the sign bit.
24731 V0 = DAG.getBitcast(VT, V0);
24732 V1 = DAG.getBitcast(VT, V1);
24733 Sel = DAG.getBitcast(VT, Sel);
24734 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
24736 // On pre-SSE41 targets we test for the sign bit by comparing to
24737 // zero - a negative value will set all bits of the lanes to true
24738 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
24739 SDValue Z = DAG.getConstant(0, dl, SelVT);
24740 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
24741 return DAG.getSelect(dl, SelVT, C, V0, V1);
24744 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
24745 // We can safely do this using i16 shifts as we're only interested in
24746 // the 3 lower bits of each byte.
24747 Amt = DAG.getBitcast(ExtVT, Amt);
24748 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
24749 Amt = DAG.getBitcast(VT, Amt);
24751 if (Opc == ISD::SHL || Opc == ISD::SRL) {
24752 // r = VSELECT(r, shift(r, 4), a);
24753 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
24754 R = SignBitSelect(VT, Amt, M, R);
24757 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
24759 // r = VSELECT(r, shift(r, 2), a);
24760 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
24761 R = SignBitSelect(VT, Amt, M, R);
24764 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
24766 // return VSELECT(r, shift(r, 1), a);
24767 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
24768 R = SignBitSelect(VT, Amt, M, R);
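// The three stages above apply shifts of 4, 2 and 1: bits 2, 1 and 0 of each
// byte's shift amount reach the sign-bit position in turn (first via the
// << 5, then via the two byte-wise additions), gating each partial shift.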
24772 if (Opc == ISD::SRA) {
24773 // For SRA we need to unpack each byte to the higher byte of an i16 vector
24774 // so we can correctly sign extend. We don't care what happens to the lower byte.
24776 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
24777 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
24778 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
24779 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
24780 ALo = DAG.getBitcast(ExtVT, ALo);
24781 AHi = DAG.getBitcast(ExtVT, AHi);
24782 RLo = DAG.getBitcast(ExtVT, RLo);
24783 RHi = DAG.getBitcast(ExtVT, RHi);
24785 // r = VSELECT(r, shift(r, 4), a);
24786 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
24787 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
24788 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
24789 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
24792 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
24793 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
24795 // r = VSELECT(r, shift(r, 2), a);
24796 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
24797 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
24798 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
24799 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
24802 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
24803 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
24805 // r = VSELECT(r, shift(r, 1), a);
24806 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
24807 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
24808 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
24809 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
24811 // Logical shift the result back to the lower byte, leaving a zero upper
24812 // byte meaning that we can safely pack with PACKUSWB.
24813 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
24814 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
24815 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
24819 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
24820 MVT ExtVT = MVT::v8i32;
24821 SDValue Z = DAG.getConstant(0, dl, VT);
24822 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
24823 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
24824 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
24825 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
24826 ALo = DAG.getBitcast(ExtVT, ALo);
24827 AHi = DAG.getBitcast(ExtVT, AHi);
24828 RLo = DAG.getBitcast(ExtVT, RLo);
24829 RHi = DAG.getBitcast(ExtVT, RHi);
24830 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
24831 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
24832 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
24833 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
24834 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
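// R is unpacked into the high 16 bits of each i32 lane (zeros in the low
// half), so the i32 shift leaves the desired 16-bit result in the upper half;
// the VSRLI by 16 moves it back down before PACKUS repacks to v16i16.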
24837 if (VT == MVT::v8i16) {
24838 // If we have a constant shift amount, the non-SSE41 path is best as
24839 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
24840 bool UseSSE41 = Subtarget.hasSSE41() &&
24841 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
24843 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
24844 // On SSE41 targets we make use of the fact that VSELECT lowers
24845 // to PBLENDVB which selects bytes based just on the sign bit.
24847 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
24848 V0 = DAG.getBitcast(ExtVT, V0);
24849 V1 = DAG.getBitcast(ExtVT, V1);
24850 Sel = DAG.getBitcast(ExtVT, Sel);
24851 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
24853 // On pre-SSE41 targets we splat the sign bit - a negative value will
24854 // set all bits of the lanes to true and VSELECT uses that in
24855 // its OR(AND(V0,C),AND(V1,~C)) lowering.
24857 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
24858 return DAG.getSelect(dl, VT, C, V0, V1);
24861 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
24863 // On SSE41 targets we need to replicate the shift mask in both
24864 // bytes for PBLENDVB.
24867 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
24868 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
24870 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
24873 // r = VSELECT(r, shift(r, 8), a);
24874 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
24875 R = SignBitSelect(Amt, M, R);
24878 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
24880 // r = VSELECT(r, shift(r, 4), a);
24881 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
24882 R = SignBitSelect(Amt, M, R);
24885 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
24887 // r = VSELECT(r, shift(r, 2), a);
24888 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
24889 R = SignBitSelect(Amt, M, R);
24892 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
24894 // return VSELECT(r, shift(r, 1), a);
24895 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
24896 R = SignBitSelect(Amt, M, R);
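// The four stages cover amounts 0-15: bit 3 of the amount (moved to bit 15 by
// the << 12) selects the shift by 8, and each doubling of Amt brings the next
// lower bit to the sign position to select the shifts by 4, 2 and 1.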
24900 // Decompose 256-bit shifts into 128-bit shifts.
24901 if (VT.is256BitVector())
24902 return split256IntArith(Op, DAG);
24907 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
24908 SelectionDAG &DAG) {
24909 MVT VT = Op.getSimpleValueType();
24910 assert(VT.isVector() && "Custom lowering only for vector rotates!");
24913 SDValue R = Op.getOperand(0);
24914 SDValue Amt = Op.getOperand(1);
24915 unsigned Opcode = Op.getOpcode();
24916 unsigned EltSizeInBits = VT.getScalarSizeInBits();
24917 int NumElts = VT.getVectorNumElements();
24919 // Check for constant splat rotation amount.
24921 SmallVector<APInt, 32> EltBits;
24922 int CstSplatIndex = -1;
24923 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
24924 for (int i = 0; i != NumElts; ++i)
24925 if (!UndefElts[i]) {
24926 if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
24930 CstSplatIndex = -1;
24934 // AVX512 implicitly uses modulo rotation amounts.
24935 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
24936 // Attempt to rotate by immediate.
24937 if (0 <= CstSplatIndex) {
24938 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
24939 uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
24940 return DAG.getNode(Op, DL, VT, R,
24941 DAG.getConstant(RotateAmt, DL, MVT::i8));
24944 // Else, fall back on VPROLV/VPRORV.
24948 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
24950 // XOP has 128-bit vector variable + immediate rotates.
24951 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
24952 // XOP implicitly uses modulo rotation amounts.
24953 if (Subtarget.hasXOP()) {
24954 if (VT.is256BitVector())
24955 return split256IntArith(Op, DAG);
24956 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
24958 // Attempt to rotate by immediate.
24959 if (0 <= CstSplatIndex) {
24960 uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
24961 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
24962 DAG.getConstant(RotateAmt, DL, MVT::i8));
24965 // Use general rotate by variable (per-element).
24969 // Split 256-bit integers on pre-AVX2 targets.
24970 if (VT.is256BitVector() && !Subtarget.hasAVX2())
24971 return split256IntArith(Op, DAG);
24973 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
24974 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
24975 Subtarget.hasAVX2())) &&
24976 "Only vXi32/vXi16/vXi8 vector rotates supported");
24978 // Rotate by a uniform constant - expand back to shifts.
24979 if (0 <= CstSplatIndex)
24982 bool IsSplatAmt = DAG.isSplatValue(Amt);
24984 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by the bits of the rotation amount (moved into each byte's sign bit).
24986 if (EltSizeInBits == 8 && !IsSplatAmt) {
24987 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
24990 // We don't need ModuloAmt here as we just peek at individual bits.
24991 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
24993 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
24994 if (Subtarget.hasSSE41()) {
24995 // On SSE41 targets we make use of the fact that VSELECT lowers
24996 // to PBLENDVB which selects bytes based just on the sign bit.
24997 V0 = DAG.getBitcast(VT, V0);
24998 V1 = DAG.getBitcast(VT, V1);
24999 Sel = DAG.getBitcast(VT, Sel);
25000 return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
25002 // On pre-SSE41 targets we test for the sign bit by comparing to
25003 // zero - a negative value will set all bits of the lanes to true
25004 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
25005 SDValue Z = DAG.getConstant(0, DL, SelVT);
25006 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
25007 return DAG.getSelect(DL, SelVT, C, V0, V1);
25010 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
25011 // We can safely do this using i16 shifts as we're only interested in
25012 // the 3 lower bits of each byte.
25013 Amt = DAG.getBitcast(ExtVT, Amt);
25014 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
25015 Amt = DAG.getBitcast(VT, Amt);
25017 // r = VSELECT(r, rot(r, 4), a);
25021 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
25022 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
25023 R = SignBitSelect(VT, Amt, M, R);
25026 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
25028 // r = VSELECT(r, rot(r, 2), a);
25031 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
25032 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
25033 R = SignBitSelect(VT, Amt, M, R);
25036 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
25038 // return VSELECT(r, rot(r, 1), a);
25041 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
25042 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
25043 return SignBitSelect(VT, Amt, M, R);
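// Each stage composes rot(r, k) as (r << k) | (r >> (8 - k)) for k = 4, 2, 1
// and keeps it only where the corresponding bit of the rotation amount
// (shifted into the byte's sign bit) is set.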
25046 // ISD::ROT* uses modulo rotate amounts.
25047 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
25048 DAG.getConstant(EltSizeInBits - 1, DL, VT));
25050 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
25051 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
25052 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
25054 // Fallback for splats + all supported variable shifts.
25055 // Fallback for non-constant AVX2 vXi16 as well.
25056 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
25057 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
25058 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
25059 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
25060 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
25061 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
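// Note: when the masked amount is zero the SRL amount equals EltSizeInBits;
// the x86 vector shifts this lowers to produce zero for out-of-range amounts,
// so the OR still yields R.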
25064 // As with shifts, convert the rotation amount to a multiplication factor.
25065 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
25066 assert(Scale && "Failed to convert ROTL amount to scale");
25068 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
25069 if (EltSizeInBits == 16) {
25070 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
25071 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
25072 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
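// MUL gives the low 16 bits of R * (1 << c) (the shifted-left part) and
// MULHU gives the high 16 bits (the wrapped-around part); their OR is
// rotl(R, c).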
25075 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
25076 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
25077 // that can then be OR'd with the lower 32-bits.
25078 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
25079 static const int OddMask[] = {1, -1, 3, -1};
25080 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
25081 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
25083 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
25084 DAG.getBitcast(MVT::v2i64, R),
25085 DAG.getBitcast(MVT::v2i64, Scale));
25086 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
25087 DAG.getBitcast(MVT::v2i64, R13),
25088 DAG.getBitcast(MVT::v2i64, Scale13));
25089 Res02 = DAG.getBitcast(VT, Res02);
25090 Res13 = DAG.getBitcast(VT, Res13);
25092 return DAG.getNode(ISD::OR, DL, VT,
25093 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
25094 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
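// Res02/Res13 hold the 64-bit products for the even/odd lanes; the first
// shuffle gathers their low 32 bits (the shifted-left part) and the second
// their high 32 bits (the wrapped part), and the OR completes the rotate.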
25097 /// Returns true if the operand type is exactly twice the native width, and
25098 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
25099 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
25100 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
25101 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
25102 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
25105 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
25106 else if (OpWidth == 128)
25107 return Subtarget.hasCmpxchg16b();
25112 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
25113 return needsCmpXchgNb(SI->getValueOperand()->getType());
25116 // Note: this turns large loads into lock cmpxchg8b/16b.
25117 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
25118 TargetLowering::AtomicExpansionKind
25119 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
25120 auto PTy = cast<PointerType>(LI->getPointerOperandType());
25121 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
25122 : AtomicExpansionKind::None;
25125 TargetLowering::AtomicExpansionKind
25126 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
25127 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
25128 Type *MemType = AI->getType();
25130 // If the operand is too big, we must see if cmpxchg8/16b is available
25131 // and default to library calls otherwise.
25132 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
25133 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
25134 : AtomicExpansionKind::None;
25137 AtomicRMWInst::BinOp Op = AI->getOperation();
25140 llvm_unreachable("Unknown atomic operation");
25141 case AtomicRMWInst::Xchg:
25142 case AtomicRMWInst::Add:
25143 case AtomicRMWInst::Sub:
25144 // It's better to use xadd, xsub or xchg for these in all cases.
25145 return AtomicExpansionKind::None;
25146 case AtomicRMWInst::Or:
25147 case AtomicRMWInst::And:
25148 case AtomicRMWInst::Xor:
25149 // If the atomicrmw's result isn't actually used, we can just add a "lock"
25150 // prefix to a normal instruction for these operations.
25151 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
25152 : AtomicExpansionKind::None;
25153 case AtomicRMWInst::Nand:
25154 case AtomicRMWInst::Max:
25155 case AtomicRMWInst::Min:
25156 case AtomicRMWInst::UMax:
25157 case AtomicRMWInst::UMin:
25158 // These always require a non-trivial set of data operations on x86. We must
25159 // use a cmpxchg loop.
25160 return AtomicExpansionKind::CmpXChg;
25165 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
25166 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
25167 Type *MemType = AI->getType();
25168 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
25169 // there is no benefit in turning such RMWs into loads, and it is actually
25170 // harmful as it introduces an mfence.
25171 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
25174 auto Builder = IRBuilder<>(AI);
25175 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25176 auto SSID = AI->getSyncScopeID();
25177 // We must restrict the ordering to avoid generating loads with Release or
25178 // ReleaseAcquire orderings.
25179 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
25180 auto Ptr = AI->getPointerOperand();
25182 // Before the load we need a fence. Here is an example lifted from
25183 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
25184 // is required:
25185 // Thread 0:
25186 //   x.store(1, relaxed);
25187 //   r1 = y.fetch_add(0, release);
25188 // Thread 1:
25189 //   y.fetch_add(42, acquire);
25190 //   r2 = x.load(relaxed);
25191 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
25192 // lowered to just a load without a fence. A mfence flushes the store buffer,
25193 // making the optimization clearly correct.
25194 // FIXME: a fence is required if isReleaseOrStronger(Order), but it is not
25195 // clear whether one is needed otherwise; we might be able to be more
25196 // aggressive on relaxed idempotent rmw. In practice, they do not look
25197 // useful, so we don't try to be especially clever.
25198 if (SSID == SyncScope::SingleThread)
25199 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
25200 // the IR level, so we must wrap it in an intrinsic.
25203 if (!Subtarget.hasMFence())
25204 // FIXME: it might make sense to use a locked operation here but on a
25205 // different cache-line to prevent cache-line bouncing. In practice it
25206 // is probably a small win, and x86 processors without mfence are rare
25207 // enough that we do not bother.
25211 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
25212 Builder.CreateCall(MFence, {});
25214 // Finally we can emit the atomic load.
25215 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
25216 AI->getType()->getPrimitiveSizeInBits());
25217 Loaded->setAtomic(Order, SSID);
25218 AI->replaceAllUsesWith(Loaded);
25219 AI->eraseFromParent();
25223 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
25224 SelectionDAG &DAG) {
25226 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
25227 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
25228 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
25229 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
25231 // The only fence that needs an instruction is a sequentially-consistent
25232 // cross-thread fence.
25233 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
25234 FenceSSID == SyncScope::System) {
25235 if (Subtarget.hasMFence())
25236 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
25238 SDValue Chain = Op.getOperand(0);
25239 SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32);
25241 DAG.getRegister(X86::ESP, MVT::i32), // Base
25242 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
25243 DAG.getRegister(0, MVT::i32), // Index
25244 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
25245 DAG.getRegister(0, MVT::i32), // Segment.
25249 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, dl, MVT::Other, Ops);
25250 return SDValue(Res, 0);
25253 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
25254 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
25257 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
25258 SelectionDAG &DAG) {
25259 MVT T = Op.getSimpleValueType();
25263 switch(T.SimpleTy) {
25264 default: llvm_unreachable("Invalid value type!");
25265 case MVT::i8: Reg = X86::AL; size = 1; break;
25266 case MVT::i16: Reg = X86::AX; size = 2; break;
25267 case MVT::i32: Reg = X86::EAX; size = 4; break;
25269 assert(Subtarget.is64Bit() && "Node not type legal!");
25270 Reg = X86::RAX; size = 8;
25273 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
25274 Op.getOperand(2), SDValue());
25275 SDValue Ops[] = { cpIn.getValue(0),
25278 DAG.getTargetConstant(size, DL, MVT::i8),
25279 cpIn.getValue(1) };
25280 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25281 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
25282 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
25286 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
25287 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
25288 MVT::i32, cpOut.getValue(2));
25289 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
25291 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
25292 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
25293 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
25297 // Create MOVMSKB, taking into account whether we need to split for AVX1.
25298 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
25299 const X86Subtarget &Subtarget) {
25300 MVT InVT = V.getSimpleValueType();
25302 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
25304 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
25305 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
25306 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
25307 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
25308 DAG.getConstant(16, DL, MVT::i8));
25309 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
25312 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
25315 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
25316 SelectionDAG &DAG) {
25317 SDValue Src = Op.getOperand(0);
25318 MVT SrcVT = Src.getSimpleValueType();
25319 MVT DstVT = Op.getSimpleValueType();
25321 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
25322 // half to v32i1 and concatenating the result.
25323 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
25324 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
25325 assert(Subtarget.hasBWI() && "Expected BWI target");
25327 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
25328 DAG.getIntPtrConstant(0, dl));
25329 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25330 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
25331 DAG.getIntPtrConstant(1, dl));
25332 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25333 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25336 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
25337 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
25338 DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
25341 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
25342 EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
25343 DstVT.getVectorNumElements() / 2);
25344 Lo = DAG.getBitcast(CastVT, Lo);
25345 Hi = DAG.getBitcast(CastVT, Hi);
25346 return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
25349 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
25350 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
25351 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
25352 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
25354 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
25355 V = getPMOVMSKB(DL, V, DAG, Subtarget);
25356 return DAG.getZExtOrTrunc(V, DL, DstVT);
25359 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
25360 SrcVT == MVT::i64) {
25361 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25362 if (DstVT != MVT::f64 && DstVT != MVT::i64 &&
25363 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
25364 // This conversion needs to be expanded.
25368 if (SrcVT.isVector()) {
25369 // Widen the input vector in the case of MVT::v2i32.
25370 // Example: from MVT::v2i32 to MVT::v4i32.
25371 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
25372 SrcVT.getVectorNumElements() * 2);
25373 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
25374 DAG.getUNDEF(SrcVT));
25376 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
25377 "Unexpected source type in LowerBITCAST");
25378 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
25381 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
25382 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
25384 if (DstVT == MVT::x86mmx)
25385 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
25387 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
25388 DAG.getIntPtrConstant(0, dl));
25391 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
25392 Subtarget.hasMMX() && "Unexpected custom BITCAST");
25393 assert((DstVT == MVT::i64 ||
25394 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
25395 "Unexpected custom BITCAST");
25396 // i64 <=> MMX conversions are Legal.
25397 if (SrcVT==MVT::i64 && DstVT.isVector())
25399 if (DstVT==MVT::i64 && SrcVT.isVector())
25401 // MMX <=> MMX conversions are Legal.
25402 if (SrcVT.isVector() && DstVT.isVector())
25404 // All other conversions need to be expanded.
25408 /// Compute the horizontal sum of bytes in V for the elements of VT.
25410 /// Requires V to be a byte vector and VT to be an integer vector type with
25411 /// wider elements than V's type. The width of the elements of VT determines
25412 /// how many bytes of V are summed horizontally to produce each element of the result.
25414 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
25415 const X86Subtarget &Subtarget,
25416 SelectionDAG &DAG) {
25418 MVT ByteVecVT = V.getSimpleValueType();
25419 MVT EltVT = VT.getVectorElementType();
25420 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
25421 "Expected value to have byte element type.");
25422 assert(EltVT != MVT::i8 &&
25423 "Horizontal byte sum only makes sense for wider elements!");
25424 unsigned VecSize = VT.getSizeInBits();
25425 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
25427 // The PSADBW instruction horizontally adds all bytes and leaves the result in
25428 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
25429 if (EltVT == MVT::i64) {
25430 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
25431 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
25432 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
25433 return DAG.getBitcast(VT, V);
25436 if (EltVT == MVT::i32) {
25437 // We unpack the low half and high half into i32s interleaved with zeros so
25438 // that we can use PSADBW to horizontally sum them. The most useful part of
25439 // this is that it lines up the results of two PSADBW instructions to be
25440 // two v2i64 vectors which concatenated are the 4 population counts. We can
25441 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
25442 SDValue Zeros = DAG.getConstant(0, DL, VT);
25443 SDValue V32 = DAG.getBitcast(VT, V);
25444 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
25445 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
25447 // Do the horizontal sums into two v2i64s.
25448 Zeros = DAG.getConstant(0, DL, ByteVecVT);
25449 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
25450 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
25451 DAG.getBitcast(ByteVecVT, Low), Zeros);
25452 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
25453 DAG.getBitcast(ByteVecVT, High), Zeros);
25455 // Merge them together.
25456 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
25457 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
25458 DAG.getBitcast(ShortVecVT, Low),
25459 DAG.getBitcast(ShortVecVT, High));
25461 return DAG.getBitcast(VT, V);
25464 // The only element type left is i16.
25465 assert(EltVT == MVT::i16 && "Unknown how to handle type");
25467 // To obtain pop count for each i16 element starting from the pop count for
25468 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
25469 // right by 8. It is important to shift as i16s because i8 vector shifts aren't
25470 // directly supported.
25471 SDValue ShifterV = DAG.getConstant(8, DL, VT);
25472 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
25473 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
25474 DAG.getBitcast(ByteVecVT, V));
25475 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
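// Per i16 lane with byte counts [hi:lo]: the << 8 puts lo in the high byte,
// the byte-wise add produces lo+hi there, and the final >> 8 moves the sum
// down to the low byte.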
25478 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
25479 const X86Subtarget &Subtarget,
25480 SelectionDAG &DAG) {
25481 MVT VT = Op.getSimpleValueType();
25482 MVT EltVT = VT.getVectorElementType();
25483 int NumElts = VT.getVectorNumElements();
25485 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
25487 // Implement a lookup table in register by using an algorithm based on:
25488 // http://wm.ite.pl/articles/sse-popcount.html
25490 // The general idea is that every lower byte nibble in the input vector is an
25491 // index into an in-register pre-computed pop count table. We then split up the
25492 // input vector into two new ones: (1) a vector with only the shifted-right
25493 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
25494 // masked out higher ones) for each byte. PSHUFB is used separately with both
25495 // to index the in-register table. Next, both are added and the result is an
25496 // i8 vector where each element contains the pop count for the input byte.
25497 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
25498 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
25499 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
25500 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
25502 SmallVector<SDValue, 64> LUTVec;
25503 for (int i = 0; i < NumElts; ++i)
25504 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
25505 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
25506 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
25509 SDValue FourV = DAG.getConstant(4, DL, VT);
25510 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
25513 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
25515 // The input vector is used as the shuffle mask that indexes elements into the
25516 // LUT. After counting low and high nibbles, add the two counts to obtain the
25517 // final pop count per i8 element.
25518 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
25519 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
25520 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
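// e.g. for the byte 0xE5: the high nibble 0xE looks up 3 and the low nibble
// 0x5 looks up 2, so the ADD yields the expected population count of 5.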
25523 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
25524 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
25525 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
25526 SelectionDAG &DAG) {
25527 MVT VT = Op.getSimpleValueType();
25528 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
25529 "Unknown CTPOP type to handle");
25530 SDLoc DL(Op.getNode());
25531 SDValue Op0 = Op.getOperand(0);
25533 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
25534 if (Subtarget.hasVPOPCNTDQ()) {
25535 unsigned NumElems = VT.getVectorNumElements();
25536 assert((VT.getVectorElementType() == MVT::i8 ||
25537 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
25538 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
25539 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
25540 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
25541 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
25542 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
25546 // Decompose 256-bit ops into smaller 128-bit ops.
25547 if (VT.is256BitVector() && !Subtarget.hasInt256())
25548 return Lower256IntUnary(Op, DAG);
25550 // Decompose 512-bit ops into smaller 256-bit ops.
25551 if (VT.is512BitVector() && !Subtarget.hasBWI())
25552 return Lower512IntUnary(Op, DAG);
25554 // For element types greater than i8, do vXi8 pop counts and a bytesum.
25555 if (VT.getScalarType() != MVT::i8) {
25556 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
25557 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
25558 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
25559 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
25562 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
25563 if (!Subtarget.hasSSSE3())
25566 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
25569 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
25570 SelectionDAG &DAG) {
25571 assert(Op.getSimpleValueType().isVector() &&
25572 "We only do custom lowering for vector population count.");
25573 return LowerVectorCTPOP(Op, Subtarget, DAG);
25576 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
25577 MVT VT = Op.getSimpleValueType();
25578 SDValue In = Op.getOperand(0);
25581 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
25582 // perform the BITREVERSE.
25583 if (!VT.isVector()) {
25584 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
25585 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
25586 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
25587 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
25588 DAG.getIntPtrConstant(0, DL));
25591 int NumElts = VT.getVectorNumElements();
25592 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
25594 // Decompose 256-bit ops into smaller 128-bit ops.
25595 if (VT.is256BitVector())
25596 return Lower256IntUnary(Op, DAG);
25598 assert(VT.is128BitVector() &&
25599 "Only 128-bit vector bitreverse lowering supported.");
25601 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
25602 // perform the BSWAP in the shuffle.
25603 // It's best to shuffle using the second operand as this will implicitly allow
25604 // memory folding for multiple vectors.
25605 SmallVector<SDValue, 16> MaskElts;
25606 for (int i = 0; i != NumElts; ++i) {
25607 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
25608 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
25609 int PermuteByte = SourceByte | (2 << 5);
25610 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
25614 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
25615 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
25616 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
25618 return DAG.getBitcast(VT, Res);
25621 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
25622 SelectionDAG &DAG) {
25623 MVT VT = Op.getSimpleValueType();
25625 if (Subtarget.hasXOP() && !VT.is512BitVector())
25626 return LowerBITREVERSE_XOP(Op, DAG);
25628 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
25630 SDValue In = Op.getOperand(0);
25633 unsigned NumElts = VT.getVectorNumElements();
25634 assert(VT.getScalarType() == MVT::i8 &&
25635 "Only byte vector BITREVERSE supported");
25637 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
25638 if (VT.is256BitVector() && !Subtarget.hasInt256())
25639 return Lower256IntUnary(Op, DAG);
25641 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
25642 // two nibbles and a PSHUFB lookup to find the bitreverse of each
25643 // 0-15 value (moved to the other nibble).
25644 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
25645 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
25646 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
25648 const int LoLUT[16] = {
25649 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
25650 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
25651 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
25652 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
25653 const int HiLUT[16] = {
25654 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
25655 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
25656 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
25657 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
25659 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
25660 for (unsigned i = 0; i < NumElts; ++i) {
25661 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
25662 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
25665 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
25666 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
25667 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
25668 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
25669 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
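// e.g. for the byte 0xB4 (1011'0100): the low nibble 0x4 maps to 0x20 and the
// high nibble 0xB maps to 0x0D, so the OR gives 0x2D (0010'1101), the
// bit-reversal of 0xB4.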
25672 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
25673 const X86Subtarget &Subtarget) {
25674 unsigned NewOpc = 0;
25675 switch (N->getOpcode()) {
25676 case ISD::ATOMIC_LOAD_ADD:
25677 NewOpc = X86ISD::LADD;
25679 case ISD::ATOMIC_LOAD_SUB:
25680 NewOpc = X86ISD::LSUB;
25682 case ISD::ATOMIC_LOAD_OR:
25683 NewOpc = X86ISD::LOR;
25685 case ISD::ATOMIC_LOAD_XOR:
25686 NewOpc = X86ISD::LXOR;
25688 case ISD::ATOMIC_LOAD_AND:
25689 NewOpc = X86ISD::LAND;
25692 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
25695 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
25697 return DAG.getMemIntrinsicNode(
25698 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
25699 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
25700 /*MemVT=*/N->getSimpleValueType(0), MMO);
25703 /// Lower atomic_load_ops into LOCK-prefixed operations.
25704 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
25705 const X86Subtarget &Subtarget) {
25706 SDValue Chain = N->getOperand(0);
25707 SDValue LHS = N->getOperand(1);
25708 SDValue RHS = N->getOperand(2);
25709 unsigned Opc = N->getOpcode();
25710 MVT VT = N->getSimpleValueType(0);
25713 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
25714 // can only be lowered when the result is unused. They should have already
25715 // been transformed into a cmpxchg loop in AtomicExpand.
25716 if (N->hasAnyUseOfValue(0)) {
25717 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
25718 // select LXADD if LOCK_SUB can't be selected.
25719 if (Opc == ISD::ATOMIC_LOAD_SUB) {
25720 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
25721 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
25722 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
25723 RHS, AN->getMemOperand());
25725 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
25726 "Used AtomicRMW ops other than Add should have been expanded!");
25730 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
25731 // RAUW the chain, but don't worry about the result, as it's unused.
25732 assert(!N->hasAnyUseOfValue(0));
25733 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
25737 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
25738 SDNode *Node = Op.getNode();
25740 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
25742 // Convert seq_cst store -> xchg
25743 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
25744 // FIXME: On 32-bit, store -> fist or movq would be more efficient
25745 // (The only way to get a 16-byte store is cmpxchg16b)
25746 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
25747 if (cast<AtomicSDNode>(Node)->getOrdering() ==
25748 AtomicOrdering::SequentiallyConsistent ||
25749 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
25750 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
25751 cast<AtomicSDNode>(Node)->getMemoryVT(),
25752 Node->getOperand(0),
25753 Node->getOperand(1), Node->getOperand(2),
25754 cast<AtomicSDNode>(Node)->getMemOperand());
25755 return Swap.getValue(1);
25757 // Other atomic stores have a simple pattern.
25761 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
25762 SDNode *N = Op.getNode();
25763 MVT VT = N->getSimpleValueType(0);
25765 // Let legalize expand this if it isn't a legal type yet.
25766 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25769 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
25772 // Set the carry flag.
25773 SDValue Carry = Op.getOperand(2);
25774 EVT CarryVT = Carry.getValueType();
25775 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
25776 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
25777 Carry, DAG.getConstant(NegOne, DL, CarryVT));
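// Adding all-ones to the incoming carry value sets the hardware carry flag
// exactly when the carry operand is non-zero, which ADC/SBB then consume.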
25779 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
25780 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
25781 Op.getOperand(1), Carry.getValue(1));
25783 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
25784 if (N->getValueType(1) == MVT::i1)
25785 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
25787 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
25790 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
25791 SelectionDAG &DAG) {
25792 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
25794 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
25795 // which returns the values as { float, float } (in XMM0) or
25796 // { double, double } (which is returned in XMM0, XMM1).
25798 SDValue Arg = Op.getOperand(0);
25799 EVT ArgVT = Arg.getValueType();
25800 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25802 TargetLowering::ArgListTy Args;
25803 TargetLowering::ArgListEntry Entry;
25807 Entry.IsSExt = false;
25808 Entry.IsZExt = false;
25809 Args.push_back(Entry);
25811 bool isF64 = ArgVT == MVT::f64;
25812 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
25813 // the small struct {f32, f32} is returned in (eax, edx). For f64,
25814 // the results are returned via SRet in memory.
25815 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25816 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
25817 const char *LibcallName = TLI.getLibcallName(LC);
25819 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
25821 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
25822 : (Type *)VectorType::get(ArgTy, 4);
25824 TargetLowering::CallLoweringInfo CLI(DAG);
25825 CLI.setDebugLoc(dl)
25826 .setChain(DAG.getEntryNode())
25827 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
25829 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
25832 // Returned in xmm0 and xmm1.
25833 return CallResult.first;
25835 // Returned in bits 0:31 and 32:63 of xmm0.
25836 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
25837 CallResult.first, DAG.getIntPtrConstant(0, dl));
25838 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
25839 CallResult.first, DAG.getIntPtrConstant(1, dl));
25840 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
25841 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
25844 /// Widen a vector input to a vector of NVT. The
25845 /// input vector must have the same element type as NVT.
25846 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
25847 bool FillWithZeroes = false) {
25848 // Check if InOp already has the right width.
25849 MVT InVT = InOp.getSimpleValueType();
25853 if (InOp.isUndef())
25854 return DAG.getUNDEF(NVT);
25856 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
25857 "input and widen element type must match");
25859 unsigned InNumElts = InVT.getVectorNumElements();
25860 unsigned WidenNumElts = NVT.getVectorNumElements();
25861 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
25862 "Unexpected request for vector widening");
25865 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
25866 InOp.getNumOperands() == 2) {
25867 SDValue N1 = InOp.getOperand(1);
25868 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
25870 InOp = InOp.getOperand(0);
25871 InVT = InOp.getSimpleValueType();
25872 InNumElts = InVT.getVectorNumElements();
25875 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
25876 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
25877 SmallVector<SDValue, 16> Ops;
25878 for (unsigned i = 0; i < InNumElts; ++i)
25879 Ops.push_back(InOp.getOperand(i));
25881 EVT EltVT = InOp.getOperand(0).getValueType();
25883 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
25884 DAG.getUNDEF(EltVT);
25885 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
25886 Ops.push_back(FillVal);
25887 return DAG.getBuildVector(NVT, dl, Ops);
25889 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
25891 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
25892 InOp, DAG.getIntPtrConstant(0, dl));
25895 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
25896 SelectionDAG &DAG) {
25897 assert(Subtarget.hasAVX512() &&
25898 "MGATHER/MSCATTER are supported on AVX-512 arch only");
25900 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
25901 SDValue Src = N->getValue();
25902 MVT VT = Src.getSimpleValueType();
25903 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
25906 SDValue Scale = N->getScale();
25907 SDValue Index = N->getIndex();
25908 SDValue Mask = N->getMask();
25909 SDValue Chain = N->getChain();
25910 SDValue BasePtr = N->getBasePtr();
25912 if (VT == MVT::v2f32) {
25913 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25914 // If the index is v2i64 and we have VLX we can use xmm for data and index.
25915 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
25916 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
25917 DAG.getUNDEF(MVT::v2f32));
25918 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
25919 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25920 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25921 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25922 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25923 return SDValue(NewScatter.getNode(), 1);
25928 if (VT == MVT::v2i32) {
25929 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25930 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
25931 DAG.getUNDEF(MVT::v2i32));
25932 // If the index is v2i64 and we have VLX we can use xmm for data and index.
25933 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
25934 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
25935 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25936 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25937 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25938 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25939 return SDValue(NewScatter.getNode(), 1);
25941 // Custom widen all the operands to avoid promotion.
25942 EVT NewIndexVT = EVT::getVectorVT(
25943 *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
25944 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25945 DAG.getUNDEF(Index.getValueType()));
25946 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25947 DAG.getConstant(0, dl, MVT::v2i1));
25948 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25949 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
25950 Ops, N->getMemOperand());
25953 MVT IndexVT = Index.getSimpleValueType();
25954 MVT MaskVT = Mask.getSimpleValueType();
25956 // If the index is v2i32, we're being called by type legalization and we
25957 // should just let the default handling take care of it.
25958 if (IndexVT == MVT::v2i32)
25961 // If we don't have VLX and neither the source data nor the index is
25962 // 512 bits, we need to widen until one is.
25963 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
25964 !Index.getSimpleValueType().is512BitVector()) {
25965 // Determine how much we need to widen by to get a 512-bit type.
25966 unsigned Factor = std::min(512/VT.getSizeInBits(),
25967 512/IndexVT.getSizeInBits());
25968 unsigned NumElts = VT.getVectorNumElements() * Factor;
25970 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
25971 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
25972 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
25974 Src = ExtendToType(Src, VT, DAG);
25975 Index = ExtendToType(Index, IndexVT, DAG);
25976 Mask = ExtendToType(Mask, MaskVT, DAG, true);
25979 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
25980 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25981 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25982 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25983 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25984 return SDValue(NewScatter.getNode(), 1);
25987 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
25988 SelectionDAG &DAG) {
25990 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
25991 MVT VT = Op.getSimpleValueType();
25992 MVT ScalarVT = VT.getScalarType();
25993 SDValue Mask = N->getMask();
25996 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
25997 "Expanding masked load is supported on AVX-512 target only!");
25999 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
26000 "Expanding masked load is supported for 32 and 64-bit types only!");
26002 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
26003 "Cannot lower masked load op.");
26005 assert((ScalarVT.getSizeInBits() >= 32 ||
26006 (Subtarget.hasBWI() &&
26007 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
26008 "Unsupported masked load op.");
26010 // This operation is legal for targets with VLX, but without
26011 // VLX the vector should be widened to 512 bits.
26012 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
26013 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
26014 SDValue PassThru = ExtendToType(N->getPassThru(), WideDataVT, DAG);
26016 // Mask element has to be i1.
26017 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
26018 "Unexpected mask type");
26020 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
26022 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
26023 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
26024 N->getBasePtr(), Mask, PassThru,
26025 N->getMemoryVT(), N->getMemOperand(),
26026 N->getExtensionType(),
26027 N->isExpandingLoad());
26029 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
26030 NewLoad.getValue(0),
26031 DAG.getIntPtrConstant(0, dl));
26032 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
26033 return DAG.getMergeValues(RetOps, dl);
26036 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
26037 SelectionDAG &DAG) {
26038 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
26039 SDValue DataToStore = N->getValue();
26040 MVT VT = DataToStore.getSimpleValueType();
26041 MVT ScalarVT = VT.getScalarType();
26042 SDValue Mask = N->getMask();
26045 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
26046 "Expanding masked load is supported on AVX-512 target only!");
26048 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
26049 "Expanding masked load is supported for 32 and 64-bit types only!");
26051 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
26052 "Cannot lower masked store op.");
26054 assert((ScalarVT.getSizeInBits() >= 32 ||
26055 (Subtarget.hasBWI() &&
26056 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
26057 "Unsupported masked store op.");
26059 // This operation is legal for targets with VLX, but without
26060 // VLX the vector should be widened to 512 bits.
26061 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
26062 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
26064 // Mask element has to be i1.
26065 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
26066 "Unexpected mask type");
26068 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
26070 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
26071 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
26072 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
26073 Mask, N->getMemoryVT(), N->getMemOperand(),
26074 N->isTruncatingStore(), N->isCompressingStore());
26077 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
26078 SelectionDAG &DAG) {
26079 assert(Subtarget.hasAVX2() &&
26080 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
26082 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
26084 MVT VT = Op.getSimpleValueType();
26085 SDValue Index = N->getIndex();
26086 SDValue Mask = N->getMask();
26087 SDValue PassThru = N->getPassThru();
26088 MVT IndexVT = Index.getSimpleValueType();
26089 MVT MaskVT = Mask.getSimpleValueType();
26091 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
26093 // If the index is v2i32, we're being called by type legalization.
26094 if (IndexVT == MVT::v2i32)
26097 // If we don't have VLX and neither the passthru nor the index is 512 bits,
26098 // we need to widen until one is.
26100 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
26101 !IndexVT.is512BitVector()) {
26102 // Determine how much we need to widen by to get a 512-bit type.
26103 unsigned Factor = std::min(512/VT.getSizeInBits(),
26104 512/IndexVT.getSizeInBits());
26106 unsigned NumElts = VT.getVectorNumElements() * Factor;
26108 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
26109 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
26110 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
26112 PassThru = ExtendToType(PassThru, VT, DAG);
26113 Index = ExtendToType(Index, IndexVT, DAG);
26114 Mask = ExtendToType(Mask, MaskVT, DAG, true);
26117 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
26119 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
26120 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
26121 N->getMemOperand());
26122 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
26123 NewGather, DAG.getIntPtrConstant(0, dl));
26124 return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
26127 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
26128 SelectionDAG &DAG) const {
26129 // TODO: Eventually, the lowering of these nodes should be informed by or
26130 // deferred to the GC strategy for the function in which they appear. For
26131 // now, however, they must be lowered to something. Since they are logically
26132 // no-ops in the case of a null GC strategy (or a GC strategy which does not
26133 // require special handling for these nodes), lower them as literal NOOPs for the time being.
26135 SmallVector<SDValue, 2> Ops;
26137 Ops.push_back(Op.getOperand(0));
26138 if (Op->getGluedNode())
26139 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
26142 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
26143 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
26148 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
26149 SelectionDAG &DAG) const {
26150 // TODO: Eventually, the lowering of these nodes should be informed by or
26151 // deferred to the GC strategy for the function in which they appear. For
26152 // now, however, they must be lowered to something. Since they are logically
26153 // no-ops in the case of a null GC strategy (or a GC strategy which does not
26154 // require special handling for these nodes), lower them as literal NOOPs for the time being.
26156 SmallVector<SDValue, 2> Ops;
26158 Ops.push_back(Op.getOperand(0));
26159 if (Op->getGluedNode())
26160 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
26163 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
26164 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
26169 /// Provide custom lowering hooks for some operations.
26170 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
26171 switch (Op.getOpcode()) {
26172 default: llvm_unreachable("Should not custom lower this!");
26173 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
26174 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
26175 return LowerCMP_SWAP(Op, Subtarget, DAG);
26176 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
26177 case ISD::ATOMIC_LOAD_ADD:
26178 case ISD::ATOMIC_LOAD_SUB:
26179 case ISD::ATOMIC_LOAD_OR:
26180 case ISD::ATOMIC_LOAD_XOR:
26181 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
26182 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
26183 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
26184 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
26185 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
26186 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
26187 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
26188 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
26189 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
26190 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
26191 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
26192 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
26193 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
26194 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
26195 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
26196 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
26197 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
26198 case ISD::SHL_PARTS:
26199 case ISD::SRA_PARTS:
26200 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
26202 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
26203 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
26204 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
26205 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
26206 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
26207 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
26208 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
26209 case ISD::ZERO_EXTEND_VECTOR_INREG:
26210 case ISD::SIGN_EXTEND_VECTOR_INREG:
26211 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
26212 case ISD::FP_TO_SINT:
26213 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
26214 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
26215 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
26216 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
26218 case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget);
26220 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
26221 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
26222 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
26223 case ISD::SETCC: return LowerSETCC(Op, DAG);
26224 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
26225 case ISD::SELECT: return LowerSELECT(Op, DAG);
26226 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
26227 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
26228 case ISD::VASTART: return LowerVASTART(Op, DAG);
26229 case ISD::VAARG: return LowerVAARG(Op, DAG);
26230 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
26231 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
26232 case ISD::INTRINSIC_VOID:
26233 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
26234 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
26235 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
26236 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
26237 case ISD::FRAME_TO_ARGS_OFFSET:
26238 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
26239 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
26240 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
26241 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
26242 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
26243 case ISD::EH_SJLJ_SETUP_DISPATCH:
26244 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
26245 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
26246 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
26247 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
26249 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
26251 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
26252 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
26254 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
26256 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
26259 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
26265 case ISD::UMULO: return LowerXALUO(Op, DAG);
26266 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
26267 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
26268 case ISD::ADDCARRY:
26269 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
26271 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
26275 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG);
26279 case ISD::UMIN: return LowerMINMAX(Op, DAG);
26280 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
26281 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
26282 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
26283 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
26284 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
26285 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
26286 case ISD::GC_TRANSITION_START:
26287 return LowerGC_TRANSITION_START(Op, DAG);
26288 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
26292 /// Places new result values for the node in Results (their number
26293 /// and types must exactly match those of the original return values of
26294 /// the node), or leaves Results empty, which indicates that the node is not
26295 /// to be custom lowered after all.
26296 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
26297 SmallVectorImpl<SDValue> &Results,
26298 SelectionDAG &DAG) const {
26299 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
26301 if (!Res.getNode())
26304 assert((N->getNumValues() <= Res->getNumValues()) &&
26305 "Lowering returned the wrong number of results!");
26307 // Place the new result values based on the result number of N.
26308 // In some cases (LowerSINT_TO_FP for example) Res has more result values
26309 // than the original node; the chain (the last value) should be dropped.
26310 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
26311 Results.push_back(Res.getValue(I));
26314 /// Replace a node with an illegal result type with a new node built out of custom code.
26316 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
26317 SmallVectorImpl<SDValue>&Results,
26318 SelectionDAG &DAG) const {
26320 switch (N->getOpcode()) {
26322 llvm_unreachable("Do not know how to custom type legalize this operation!");
26324 EVT VT = N->getValueType(0);
26325 assert(VT.isVector() && "Unexpected VT");
26326 if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
26327 VT.getVectorNumElements() == 2) {
26328 // Promote to a pattern that will be turned into PMULUDQ.
26329 SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
26331 SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
26333 SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
26334 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
26335 } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
26336 VT.getVectorElementType() == MVT::i8) {
26337 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
26338 // elements are needed.
26339 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
26340 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
26341 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
26342 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
26343 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
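// Pad the narrow result out to a full 128-bit v16i8 with undef subvectors so
// the final type matches what the widening type legalizer expects.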
26344 unsigned NumConcats = 16 / VT.getVectorNumElements();
26345 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
26346 ConcatOps[0] = Res;
26347 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
26348 Results.push_back(Res);
26356 case X86ISD::VPMADDWD:
26357 case X86ISD::AVG: {
26358 // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
26359 // X86ISD::AVG/VPMADDWD by widening.
26360 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
26362 EVT VT = N->getValueType(0);
26363 EVT InVT = N->getOperand(0).getValueType();
26364 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
26365 "Expected a VT that divides into 128 bits.");
26366 unsigned NumConcat = 128 / InVT.getSizeInBits();
26368 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
26369 InVT.getVectorElementType(),
26370 NumConcat * InVT.getVectorNumElements());
26371 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
26372 VT.getVectorElementType(),
26373 NumConcat * VT.getVectorNumElements());
26375 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
26376 Ops[0] = N->getOperand(0);
26377 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
26378 Ops[0] = N->getOperand(1);
26379 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
26381 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
26382 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
26383 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
26384 DAG.getIntPtrConstant(0, dl));
26385 Results.push_back(Res);
26389 // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
26390 // setCC result type is v2i1 because type legalization will end up with
26391 // a v4i1 setcc plus an extend.
26392 assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
26393 if (N->getOperand(0).getValueType() != MVT::v2f32 ||
26394 getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
26396 SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
26397 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
26398 N->getOperand(0), UNDEF);
26399 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
26400 N->getOperand(1), UNDEF);
26401 SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
26403 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
26404 DAG.getIntPtrConstant(0, dl));
26405 Results.push_back(Res);
26408 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
26409 case X86ISD::FMINC:
26411 case X86ISD::FMAXC:
26412 case X86ISD::FMAX: {
26413 EVT VT = N->getValueType(0);
26414 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
26415 SDValue UNDEF = DAG.getUNDEF(VT);
26416 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
26417 N->getOperand(0), UNDEF);
26418 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
26419 N->getOperand(1), UNDEF);
26420 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
26427 EVT VT = N->getValueType(0);
26428 if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) {
26429 // If the RHS is a constant splat vector, we can widen this and let
26430 // division/remainder by constant optimize it.
26431 // TODO: Can we do something for non-splat?
26433 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
26434 unsigned NumConcats = 128 / VT.getSizeInBits();
26435 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
26436 Ops0[0] = N->getOperand(0);
26437 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
26438 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
26439 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
26440 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
26441 Results.push_back(Res);
26446 if (VT == MVT::v2i32) {
26447 // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the
26448 // v2i64 and unroll later. But then we create i64 scalar ops which
26449 // might be slow in 64-bit mode or require a libcall in 32-bit mode.
26450 Results.push_back(DAG.UnrollVectorOp(N));
26460 case ISD::UDIVREM: {
26461 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
26462 Results.push_back(V);
26465 case ISD::TRUNCATE: {
26466 MVT VT = N->getSimpleValueType(0);
26467 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
26470 // The generic legalizer will try to widen the input type to the same
26471 // number of elements as the widened result type. But this isn't always
26472 // the best thing, so do some custom legalization to avoid some cases.
26473 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
26474 SDValue In = N->getOperand(0);
26475 EVT InVT = In.getValueType();
26477 unsigned InBits = InVT.getSizeInBits();
26478 if (128 % InBits == 0) {
26479 // 128-bit and smaller inputs should avoid truncate altogether and
26480 // just use a build_vector that will become a shuffle.
26481 // TODO: Widen and use a shuffle directly?
26482 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
26483 EVT EltVT = VT.getVectorElementType();
26484 unsigned WidenNumElts = WidenVT.getVectorNumElements();
26485 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
26486 // Use the original element count so we don't do more scalar opts than necessary.
26488 unsigned MinElts = VT.getVectorNumElements();
26489 for (unsigned i=0; i < MinElts; ++i) {
26490 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
26491 DAG.getIntPtrConstant(i, dl));
26492 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
26494 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
26497 // With AVX512 there are some cases that can use a target specific
26498 // truncate node to go from 256/512 to less than 128 with zeros in the
26499 // upper elements of the 128 bit result.
26500 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
26501 // We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit input.
26502 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
26503 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
26506 // There's one case we can widen to 512 bits and use VTRUNC.
26507 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
26508 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
26509 DAG.getUNDEF(MVT::v4i64));
26510 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
26516 case ISD::SIGN_EXTEND_VECTOR_INREG: {
26517 if (ExperimentalVectorWideningLegalization)
26520 EVT VT = N->getValueType(0);
26521 SDValue In = N->getOperand(0);
26522 EVT InVT = In.getValueType();
26523 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
26524 (InVT == MVT::v16i16 || InVT == MVT::v32i8)) {
26525 // Custom split this so we can extend i8/i16->i32 invec. This is better
26526 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
26527 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting,
26528 // we allow the sra from the extend to i32 to be shared by the split.
26529 EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
26530 InVT.getVectorElementType(),
26531 InVT.getVectorNumElements() / 2);
26532 MVT ExtendVT = MVT::getVectorVT(MVT::i32,
26533 VT.getVectorNumElements());
26534 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT,
26535 In, DAG.getIntPtrConstant(0, dl));
26536 In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In);
26538 // Fill a vector with sign bits for each element.
26539 SDValue Zero = DAG.getConstant(0, dl, ExtendVT);
26540 SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT);
26543 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
26545 // Create an unpackl and unpackh to interleave the sign bits, then bitcast to vXi64.
26547 SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits);
26548 Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
26549 SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits);
26550 Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
26552 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26553 Results.push_back(Res);
26558 case ISD::SIGN_EXTEND:
26559 case ISD::ZERO_EXTEND: {
26560 if (!ExperimentalVectorWideningLegalization)
26563 EVT VT = N->getValueType(0);
26564 SDValue In = N->getOperand(0);
26565 EVT InVT = In.getValueType();
26566 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
26567 (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
26568 // Custom split this so we can extend i8/i16->i32 invec. This is better
26569 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
26570 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting,
26571 // we allow the sra from the extend to i32 to be shared by the split.
26572 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
26574 // Fill a vector with sign bits for each element.
26575 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26576 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
26578 // Create an unpackl and unpackh to interleave the sign bits, then bitcast to v2i64.
26580 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
26582 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
26583 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
26585 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
26587 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26588 Results.push_back(Res);
26592 if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) {
26593 // Perform custom splitting instead of the two stage extend we would get by default.
26596 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
26597 assert(isTypeLegal(LoVT) && "Split VT not legal?");
26599 bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
26601 SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG);
26603 // We need to shift the input over by half the number of elements.
26604 unsigned NumElts = InVT.getVectorNumElements();
26605 unsigned HalfNumElts = NumElts / 2;
26606 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
26607 for (unsigned i = 0; i != HalfNumElts; ++i)
26608 ShufMask[i] = i + HalfNumElts;
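// Shuffle the upper half of the input down into the low elements, then
// extend it in-vector to form the high half of the result.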
26610 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26611 Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG);
26613 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26614 Results.push_back(Res);
26618 case ISD::FP_TO_SINT:
26619 case ISD::FP_TO_UINT: {
26620 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
26621 EVT VT = N->getValueType(0);
26622 SDValue Src = N->getOperand(0);
26623 EVT SrcVT = Src.getValueType();
26625 // Promote these manually to avoid over promotion to v2i64. Type
26626 // legalization will revisit the v2i32 operation for more cleanup.
26627 if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
26628 getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
26629 // AVX512DQ provides instructions that produce a v2i64 result.
26630 if (Subtarget.hasDQI())
26633 SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
26634 Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
26636 dl, MVT::v2i32, Res,
26637 DAG.getValueType(VT.getVectorElementType()));
26638 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
26639 Results.push_back(Res);
26643 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
26644 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
26647 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
26648 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
26649 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
26650 VT.getVectorNumElements());
26651 SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
26653 // Preserve what we know about the size of the original result. Except
26654 // when the result is v2i32 since we can't widen the assert.
26655 if (PromoteVT != MVT::v2i32)
26656 Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
26658 dl, PromoteVT, Res,
26659 DAG.getValueType(VT.getVectorElementType()));
26661 // Truncate back to the original width.
26662 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
26664 // Now widen to 128 bits.
26665 unsigned NumConcats = 128 / VT.getSizeInBits();
26666 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
26667 VT.getVectorNumElements() * NumConcats);
26668 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
26669 ConcatOps[0] = Res;
26670 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
26671 Results.push_back(Res);
26676 if (VT == MVT::v2i32) {
26677 assert((IsSigned || Subtarget.hasAVX512()) &&
26678 "Can only handle signed conversion without AVX512");
26679 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
26681 getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector;
26682 if (Src.getValueType() == MVT::v2f64) {
26683 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
26684 if (!IsSigned && !Subtarget.hasVLX()) {
26685 // If v2i32 is widened, we can defer to the generic legalizer.
26688 // Custom widen by doubling to a legal vector width. Isel will
26689 // further widen to v8f64.
26690 Opc = ISD::FP_TO_UINT;
26691 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64,
26692 Src, DAG.getUNDEF(MVT::v2f64));
26694 SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
26696 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
26697 DAG.getIntPtrConstant(0, dl));
26698 Results.push_back(Res);
26701 if (SrcVT == MVT::v2f32 &&
26702 getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
26703 SDValue Idx = DAG.getIntPtrConstant(0, dl);
26704 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
26705 DAG.getUNDEF(MVT::v2f32));
26706 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
26707 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
26708 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
26709 Results.push_back(Res);
26713 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
26714 // so early out here.
26718 if (Subtarget.hasDQI() && VT == MVT::i64 &&
26719 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
26720 assert(!Subtarget.is64Bit() && "i64 should be legal");
26721 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
26722 // Using a 256-bit input here to guarantee 128-bit input for f32 case.
26723 // TODO: Use 128-bit vectors for f64 case?
26724 // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
26725 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
26726 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
26728 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
26729 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
26730 DAG.getConstantFP(0.0, dl, VecInVT), Src,
26732 Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
26733 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
26734 Results.push_back(Res);
26738 std::pair<SDValue,SDValue> Vals =
26739 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
26740 SDValue FIST = Vals.first, StackSlot = Vals.second;
26741 if (FIST.getNode()) {
26742 // Return a load from the stack slot.
26743 if (StackSlot.getNode())
26745 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
26747 Results.push_back(FIST);
26751 case ISD::SINT_TO_FP: {
26752 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
26753 SDValue Src = N->getOperand(0);
26754 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
26756 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
26759 case ISD::UINT_TO_FP: {
26760 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
26761 EVT VT = N->getValueType(0);
26762 if (VT != MVT::v2f32)
26764 SDValue Src = N->getOperand(0);
26765 EVT SrcVT = Src.getValueType();
26766 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
26767 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
26770 if (SrcVT != MVT::v2i32)
26772 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
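// 0x4330000000000000 is the bit pattern of 2^52. OR-ing the zero-extended
// i32 values into the low mantissa bits of 2^52 and subtracting 2^52 gives an
// exact unsigned int->double conversion, which VFPROUND then narrows to f32.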
26774 SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
26775 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
26776 DAG.getBitcast(MVT::v2i64, VBias));
26777 Or = DAG.getBitcast(MVT::v2f64, Or);
26778 // TODO: Are there any fast-math-flags to propagate here?
26779 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
26780 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
26783 case ISD::FP_ROUND: {
26784 if (!isTypeLegal(N->getOperand(0).getValueType()))
26786 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
26787 Results.push_back(V);
26790 case ISD::FP_EXTEND: {
26791 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
26792 // No other ValueType for FP_EXTEND should reach this point.
26793 assert(N->getValueType(0) == MVT::v2f32 &&
26794 "Do not know how to legalize this Node");
26797 case ISD::INTRINSIC_W_CHAIN: {
26798 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
26800 default : llvm_unreachable("Do not know how to custom type "
26801 "legalize this intrinsic operation!");
26802 case Intrinsic::x86_rdtsc:
26803 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
26805 case Intrinsic::x86_rdtscp:
26806 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
26808 case Intrinsic::x86_rdpmc:
26809 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
26811 case Intrinsic::x86_xgetbv:
26812 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
26815 case ISD::INTRINSIC_WO_CHAIN: {
26816 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
26817 Results.push_back(V);
26820 case ISD::READCYCLECOUNTER: {
26821 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
26824 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
26825 EVT T = N->getValueType(0);
26826 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
26827 bool Regs64bit = T == MVT::i128;
26828 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
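// CMPXCHG8B/16B expects the compared value in EDX:EAX (RDX:RAX) and the new
// value in ECX:EBX (RCX:RBX), so split both wide operands into halves and
// copy them into those register pairs.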
26829 SDValue cpInL, cpInH;
26830 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
26831 DAG.getConstant(0, dl, HalfT));
26832 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
26833 DAG.getConstant(1, dl, HalfT));
26834 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
26835 Regs64bit ? X86::RAX : X86::EAX,
26837 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
26838 Regs64bit ? X86::RDX : X86::EDX,
26839 cpInH, cpInL.getValue(1));
26840 SDValue swapInL, swapInH;
26841 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
26842 DAG.getConstant(0, dl, HalfT));
26843 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
26844 DAG.getConstant(1, dl, HalfT));
26846 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
26847 swapInH, cpInH.getValue(1));
26848 // If the current function needs the base pointer, RBX,
26849 // we shouldn't use cmpxchg directly. The lowering of that
26850 // instruction will clobber that register, and since RBX will
26851 // be a reserved register the register allocator will not make
26852 // sure its value is properly saved and restored around this
26853 // live range.
26854 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26856 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26857 unsigned BasePtr = TRI->getBaseRegister();
26858 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
26859 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
26860 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
26861 // ISel prefers the LCMPXCHG64 variant.
26862 // If that assert breaks, that means it is not the case anymore,
26863 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
26864 // not just EBX. This is a matter of accepting i64 input for that
26865 // pseudo, and restoring into the register of the right width
26866 // in the expand pseudo. Everything else should just work.
26867 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
26868 "Saving only half of the RBX");
26869 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
26870 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
26871 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
26872 Regs64bit ? X86::RBX : X86::EBX,
26873 HalfT, swapInH.getValue(1));
26874 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
26876 /*Glue*/ RBXSave.getValue(2)};
26877 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
26880 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
26881 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
26882 Regs64bit ? X86::RBX : X86::EBX, swapInL,
26883 swapInH.getValue(1));
26884 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
26885 swapInL.getValue(1)};
26886 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
26888 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
26889 Regs64bit ? X86::RAX : X86::EAX,
26890 HalfT, Result.getValue(1));
26891 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
26892 Regs64bit ? X86::RDX : X86::EDX,
26893 HalfT, cpOutL.getValue(2));
26894 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
26896 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
26897 MVT::i32, cpOutH.getValue(2));
26898 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
26899 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
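// The expanded results are the reassembled wide value (low/high halves), the
// success flag computed from EFLAGS, and the output chain.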
26901 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
26902 Results.push_back(Success);
26903 Results.push_back(EFLAGS.getValue(1));
26906 case ISD::ATOMIC_SWAP:
26907 case ISD::ATOMIC_LOAD_ADD:
26908 case ISD::ATOMIC_LOAD_SUB:
26909 case ISD::ATOMIC_LOAD_AND:
26910 case ISD::ATOMIC_LOAD_OR:
26911 case ISD::ATOMIC_LOAD_XOR:
26912 case ISD::ATOMIC_LOAD_NAND:
26913 case ISD::ATOMIC_LOAD_MIN:
26914 case ISD::ATOMIC_LOAD_MAX:
26915 case ISD::ATOMIC_LOAD_UMIN:
26916 case ISD::ATOMIC_LOAD_UMAX:
26917 case ISD::ATOMIC_LOAD: {
26918 // Delegate to generic TypeLegalization. Situations we can really handle
26919 // should have already been dealt with by AtomicExpandPass.cpp.
26922 case ISD::BITCAST: {
26923 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
26924 EVT DstVT = N->getValueType(0);
26925 EVT SrcVT = N->getOperand(0).getValueType();
26927 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
26928 // we can split using the k-register rather than memory.
26929 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
26930 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
26932 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
26933 Lo = DAG.getBitcast(MVT::i32, Lo);
26934 Hi = DAG.getBitcast(MVT::i32, Hi);
26935 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
26936 Results.push_back(Res);
26940 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
26941 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
26942 SrcVT.isVector() && isTypeLegal(SrcVT)) {
26944 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
26945 MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
26946 Lo = DAG.getBitcast(CastVT, Lo);
26947 Hi = DAG.getBitcast(CastVT, Hi);
26948 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
26949 Results.push_back(Res);
26953 if (SrcVT != MVT::f64 ||
26954 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
26955 getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
26958 unsigned NumElts = DstVT.getVectorNumElements();
26959 EVT SVT = DstVT.getVectorElementType();
26960 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
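// Place the f64 into a v2f64, bitcast that to the wider vector of the
// destination element type, and extract the low DstVT-sized subvector.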
26962 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
26963 Res = DAG.getBitcast(WiderVT, Res);
26964 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
26965 DAG.getIntPtrConstant(0, dl));
26966 Results.push_back(Res);
26969 case ISD::MGATHER: {
26970 EVT VT = N->getValueType(0);
26971 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
26972 auto *Gather = cast<MaskedGatherSDNode>(N);
26973 SDValue Index = Gather->getIndex();
26974 if (Index.getValueType() != MVT::v2i64)
26976 SDValue Mask = Gather->getMask();
26977 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
26978 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
26979 Gather->getPassThru(),
26980 DAG.getUNDEF(MVT::v2f32));
26981 if (!Subtarget.hasVLX()) {
26982 // We need to widen the mask, but the instruction will only use 2
26983 // of its elements. So we can use undef.
26984 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
26985 DAG.getUNDEF(MVT::v2i1));
26986 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
26988 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
26989 Gather->getBasePtr(), Index, Gather->getScale() };
26990 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
26991 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
26992 Gather->getMemoryVT(), Gather->getMemOperand());
26993 Results.push_back(Res);
26994 Results.push_back(Res.getValue(2));
26997 if (VT == MVT::v2i32) {
26998 auto *Gather = cast<MaskedGatherSDNode>(N);
26999 SDValue Index = Gather->getIndex();
27000 SDValue Mask = Gather->getMask();
27001 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
27002 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
27003 Gather->getPassThru(),
27004 DAG.getUNDEF(MVT::v2i32));
27005 // If the index is v2i64 we can use it directly.
27006 if (Index.getValueType() == MVT::v2i64 &&
27007 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
27008 if (!Subtarget.hasVLX()) {
27009 // We need to widen the mask, but the instruction will only use 2
27010 // of its elements. So we can use undef.
27011 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
27012 DAG.getUNDEF(MVT::v2i1));
27013 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
27015 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
27016 Gather->getBasePtr(), Index, Gather->getScale() };
27017 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
27018 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
27019 Gather->getMemoryVT(), Gather->getMemOperand());
27020 SDValue Chain = Res.getValue(2);
27021 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
27022 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
27023 DAG.getIntPtrConstant(0, dl));
27024 Results.push_back(Res);
27025 Results.push_back(Chain);
27028 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
27029 EVT IndexVT = Index.getValueType();
27030 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
27031 IndexVT.getScalarType(), 4);
27032 // Otherwise we need to custom widen everything to avoid promotion.
27033 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
27034 DAG.getUNDEF(IndexVT));
27035 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
27036 DAG.getConstant(0, dl, MVT::v2i1));
27037 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
27038 Gather->getBasePtr(), Index, Gather->getScale() };
27039 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
27040 Gather->getMemoryVT(), dl, Ops,
27041 Gather->getMemOperand());
27042 SDValue Chain = Res.getValue(1);
27043 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
27044 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
27045 DAG.getIntPtrConstant(0, dl));
27046 Results.push_back(Res);
27047 Results.push_back(Chain);
27054 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
27055 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
27056 // cast since type legalization will try to use an i64 load.
27057 MVT VT = N->getSimpleValueType(0);
27058 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
27059 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
27061 if (!ISD::isNON_EXTLoad(N))
27063 auto *Ld = cast<LoadSDNode>(N);
27064 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
27065 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
27066 Ld->getPointerInfo(),
27067 Ld->getAlignment(),
27068 Ld->getMemOperand()->getFlags());
27069 SDValue Chain = Res.getValue(1);
27070 MVT WideVT = MVT::getVectorVT(LdVT, 2);
27071 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
27072 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
27073 VT.getVectorNumElements() * 2);
27074 Res = DAG.getBitcast(CastVT, Res);
27075 Results.push_back(Res);
27076 Results.push_back(Chain);
27082 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
27083 switch ((X86ISD::NodeType)Opcode) {
27084 case X86ISD::FIRST_NUMBER: break;
27085 case X86ISD::BSF: return "X86ISD::BSF";
27086 case X86ISD::BSR: return "X86ISD::BSR";
27087 case X86ISD::SHLD: return "X86ISD::SHLD";
27088 case X86ISD::SHRD: return "X86ISD::SHRD";
27089 case X86ISD::FAND: return "X86ISD::FAND";
27090 case X86ISD::FANDN: return "X86ISD::FANDN";
27091 case X86ISD::FOR: return "X86ISD::FOR";
27092 case X86ISD::FXOR: return "X86ISD::FXOR";
27093 case X86ISD::FILD: return "X86ISD::FILD";
27094 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
27095 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
27096 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
27097 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
27098 case X86ISD::FLD: return "X86ISD::FLD";
27099 case X86ISD::FST: return "X86ISD::FST";
27100 case X86ISD::CALL: return "X86ISD::CALL";
27101 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
27102 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
27103 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
27104 case X86ISD::BT: return "X86ISD::BT";
27105 case X86ISD::CMP: return "X86ISD::CMP";
27106 case X86ISD::COMI: return "X86ISD::COMI";
27107 case X86ISD::UCOMI: return "X86ISD::UCOMI";
27108 case X86ISD::CMPM: return "X86ISD::CMPM";
27109 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
27110 case X86ISD::SETCC: return "X86ISD::SETCC";
27111 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
27112 case X86ISD::FSETCC: return "X86ISD::FSETCC";
27113 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
27114 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
27115 case X86ISD::CMOV: return "X86ISD::CMOV";
27116 case X86ISD::BRCOND: return "X86ISD::BRCOND";
27117 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
27118 case X86ISD::IRET: return "X86ISD::IRET";
27119 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
27120 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
27121 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
27122 case X86ISD::Wrapper: return "X86ISD::Wrapper";
27123 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
27124 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
27125 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
27126 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
27127 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
27128 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
27129 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
27130 case X86ISD::PINSRB: return "X86ISD::PINSRB";
27131 case X86ISD::PINSRW: return "X86ISD::PINSRW";
27132 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
27133 case X86ISD::ANDNP: return "X86ISD::ANDNP";
27134 case X86ISD::BLENDI: return "X86ISD::BLENDI";
27135 case X86ISD::BLENDV: return "X86ISD::BLENDV";
27136 case X86ISD::HADD: return "X86ISD::HADD";
27137 case X86ISD::HSUB: return "X86ISD::HSUB";
27138 case X86ISD::FHADD: return "X86ISD::FHADD";
27139 case X86ISD::FHSUB: return "X86ISD::FHSUB";
27140 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
27141 case X86ISD::FMAX: return "X86ISD::FMAX";
27142 case X86ISD::FMAXS: return "X86ISD::FMAXS";
27143 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
27144 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
27145 case X86ISD::FMIN: return "X86ISD::FMIN";
27146 case X86ISD::FMINS: return "X86ISD::FMINS";
27147 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
27148 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
27149 case X86ISD::FMAXC: return "X86ISD::FMAXC";
27150 case X86ISD::FMINC: return "X86ISD::FMINC";
27151 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
27152 case X86ISD::FRCP: return "X86ISD::FRCP";
27153 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
27154 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
27155 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
27156 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
27157 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
27158 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
27159 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
27160 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
27161 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
27162 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
27163 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
27164 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
27165 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
27166 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
27167 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
27168 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
27169 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
27170 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
27171 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
27172 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
27173 case X86ISD::LADD: return "X86ISD::LADD";
27174 case X86ISD::LSUB: return "X86ISD::LSUB";
27175 case X86ISD::LOR: return "X86ISD::LOR";
27176 case X86ISD::LXOR: return "X86ISD::LXOR";
27177 case X86ISD::LAND: return "X86ISD::LAND";
27178 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
27179 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
27180 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
27181 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
27182 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
27183 case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC";
27184 case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS";
27185 case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS";
27186 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
27187 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
27188 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
27189 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
27190 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
27191 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
27192 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
27193 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
27194 case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
27195 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
27196 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
27197 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
27198 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
27199 case X86ISD::VSHL: return "X86ISD::VSHL";
27200 case X86ISD::VSRL: return "X86ISD::VSRL";
27201 case X86ISD::VSRA: return "X86ISD::VSRA";
27202 case X86ISD::VSHLI: return "X86ISD::VSHLI";
27203 case X86ISD::VSRLI: return "X86ISD::VSRLI";
27204 case X86ISD::VSRAI: return "X86ISD::VSRAI";
27205 case X86ISD::VSRAV: return "X86ISD::VSRAV";
27206 case X86ISD::VROTLI: return "X86ISD::VROTLI";
27207 case X86ISD::VROTRI: return "X86ISD::VROTRI";
27208 case X86ISD::VPPERM: return "X86ISD::VPPERM";
27209 case X86ISD::CMPP: return "X86ISD::CMPP";
27210 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
27211 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
27212 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
27213 case X86ISD::ADD: return "X86ISD::ADD";
27214 case X86ISD::SUB: return "X86ISD::SUB";
27215 case X86ISD::ADC: return "X86ISD::ADC";
27216 case X86ISD::SBB: return "X86ISD::SBB";
27217 case X86ISD::SMUL: return "X86ISD::SMUL";
27218 case X86ISD::UMUL: return "X86ISD::UMUL";
27219 case X86ISD::OR: return "X86ISD::OR";
27220 case X86ISD::XOR: return "X86ISD::XOR";
27221 case X86ISD::AND: return "X86ISD::AND";
27222 case X86ISD::BEXTR: return "X86ISD::BEXTR";
27223 case X86ISD::BZHI: return "X86ISD::BZHI";
27224 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
27225 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
27226 case X86ISD::PTEST: return "X86ISD::PTEST";
27227 case X86ISD::TESTP: return "X86ISD::TESTP";
27228 case X86ISD::KORTEST: return "X86ISD::KORTEST";
27229 case X86ISD::KTEST: return "X86ISD::KTEST";
27230 case X86ISD::KADD: return "X86ISD::KADD";
27231 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
27232 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
27233 case X86ISD::PACKSS: return "X86ISD::PACKSS";
27234 case X86ISD::PACKUS: return "X86ISD::PACKUS";
27235 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
27236 case X86ISD::VALIGN: return "X86ISD::VALIGN";
27237 case X86ISD::VSHLD: return "X86ISD::VSHLD";
27238 case X86ISD::VSHRD: return "X86ISD::VSHRD";
27239 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
27240 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
27241 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
27242 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
27243 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
27244 case X86ISD::SHUFP: return "X86ISD::SHUFP";
27245 case X86ISD::SHUF128: return "X86ISD::SHUF128";
27246 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
27247 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
27248 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
27249 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
27250 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
27251 case X86ISD::MOVSD: return "X86ISD::MOVSD";
27252 case X86ISD::MOVSS: return "X86ISD::MOVSS";
27253 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
27254 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
27255 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
27256 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
27257 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
27258 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
27259 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
27260 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
27261 case X86ISD::VPERMV: return "X86ISD::VPERMV";
27262 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
27263 case X86ISD::VPERMI: return "X86ISD::VPERMI";
27264 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
27265 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
27266 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
27267 case X86ISD::VRANGE: return "X86ISD::VRANGE";
27268 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
27269 case X86ISD::VRANGES: return "X86ISD::VRANGES";
27270 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
27271 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
27272 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
27273 case X86ISD::PSADBW: return "X86ISD::PSADBW";
27274 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
27275 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
27276 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
27277 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
27278 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
27279 case X86ISD::MFENCE: return "X86ISD::MFENCE";
27280 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
27281 case X86ISD::SAHF: return "X86ISD::SAHF";
27282 case X86ISD::RDRAND: return "X86ISD::RDRAND";
27283 case X86ISD::RDSEED: return "X86ISD::RDSEED";
27284 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
27285 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
27286 case X86ISD::VPSHA: return "X86ISD::VPSHA";
27287 case X86ISD::VPSHL: return "X86ISD::VPSHL";
27288 case X86ISD::VPCOM: return "X86ISD::VPCOM";
27289 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
27290 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
27291 case X86ISD::FMSUB: return "X86ISD::FMSUB";
27292 case X86ISD::FNMADD: return "X86ISD::FNMADD";
27293 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
27294 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
27295 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
27296 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
27297 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
27298 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
27299 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
27300 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
27301 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
27302 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
27303 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
27304 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
27305 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
27306 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
27307 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
27308 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
27309 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
27310 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
27311 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
27312 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
27313 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
27314 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
27315 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
27316 case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
27317 case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
27318 case X86ISD::XTEST: return "X86ISD::XTEST";
27319 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
27320 case X86ISD::EXPAND: return "X86ISD::EXPAND";
27321 case X86ISD::SELECTS: return "X86ISD::SELECTS";
27322 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
27323 case X86ISD::RCP14: return "X86ISD::RCP14";
27324 case X86ISD::RCP14S: return "X86ISD::RCP14S";
27325 case X86ISD::RCP28: return "X86ISD::RCP28";
27326 case X86ISD::RCP28S: return "X86ISD::RCP28S";
27327 case X86ISD::EXP2: return "X86ISD::EXP2";
27328 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
27329 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
27330 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
27331 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
27332 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
27333 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
27334 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
27335 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
27336 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
27337 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
27338 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
27339 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
27340 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
27341 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
27342 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
27343 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
27344 case X86ISD::SCALEF: return "X86ISD::SCALEF";
27345 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
27346 case X86ISD::AVG: return "X86ISD::AVG";
27347 case X86ISD::MULHRS: return "X86ISD::MULHRS";
27348 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
27349 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
27350 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
27351 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
27352 case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
27353 case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
27354 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
27355 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
27356 case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
27357 case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
27358 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
27359 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
27360 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
27361 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
27362 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
27363 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
27364 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
27365 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
27366 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
27367 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
27368 case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
27369 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
27370 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
27371 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
27372 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
27373 case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
27374 case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI";
27375 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
27376 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
27377 case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI";
27378 case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
27379 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
27380 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
27381 case X86ISD::LWPINS: return "X86ISD::LWPINS";
27382 case X86ISD::MGATHER: return "X86ISD::MGATHER";
27383 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
27384 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
27385 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
27386 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
27387 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
27388 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
27389 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
27390 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
27391 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
27392 case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
27393 case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
27394 case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
27395 case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
27400 /// Return true if the addressing mode represented by AM is legal for this
27401 /// target, for a load/store of the specified type.
27402 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
27403 const AddrMode &AM, Type *Ty,
27405 Instruction *I) const {
27406 // X86 supports extremely general addressing modes.
27407 CodeModel::Model M = getTargetMachine().getCodeModel();
27409 // X86 allows a sign-extended 32-bit immediate field as a displacement.
27410 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
27414 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
27416 // If a reference to this global requires an extra load, we can't fold it.
27417 if (isGlobalStubReference(GVFlags))
27420 // If BaseGV requires a register for the PIC base, we cannot also have a
27421 // BaseReg specified.
27422 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
27425 // If lower 4G is not available, then we must use rip-relative addressing.
27426 if ((M != CodeModel::Small || isPositionIndependent()) &&
27427 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
27431 switch (AM.Scale) {
27437 // These scales always work.
27442 // These scales are formed with basereg+scalereg. Only accept if there is
27447 default: // Other stuff never works.
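// As a concrete illustration: [base + 8*index + disp32] is always encodable
// via the SIB byte, whereas a scale of 3 only exists as base + 2*index with
// the index register doubling as the base, so it is rejected once a base
// register is already in use.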
27454 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
27455 unsigned Bits = Ty->getScalarSizeInBits();
27457 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
27458 // particularly cheaper than those without.
27462 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
27463 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
27464 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
27467 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
27468 // shifts just as cheap as scalar ones.
27469 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
27472 // AVX512BW has shifts such as vpsllvw.
27473 if (Subtarget.hasBWI() && Bits == 16)
27476 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
27477 // fully general vector.
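// For example, with plain SSE2 a uniform v8i16 shift is a single psllw with
// the count in an XMM register, while a per-element variable shift has to be
// scalarized (there is no vpsllvw until AVX512BW).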
27481 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
27482 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
27484 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
27485 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
27486 return NumBits1 > NumBits2;
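// For example, truncating i64 to i32 is free on x86-64: the value is simply
// used through its 32-bit subregister and no instruction is emitted.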
27489 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
27490 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
27493 if (!isTypeLegal(EVT::getEVT(Ty1)))
27496 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
27498 // Assuming the caller doesn't have a zeroext or signext return parameter,
27499 // truncation all the way down to i1 is valid.
27503 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
27504 return isInt<32>(Imm);
27507 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
27508 // Can also use sub to handle negated immediates.
27509 return isInt<32>(Imm);
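// For example, 'add rax, 0x7FFFFFFF' encodes the constant as a sign-extended
// imm32, while adding 0x100000000 requires materializing the constant in a
// register first.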
27512 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
27513 return isInt<32>(Imm);
27516 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
27517 if (!VT1.isInteger() || !VT2.isInteger())
27519 unsigned NumBits1 = VT1.getSizeInBits();
27520 unsigned NumBits2 = VT2.getSizeInBits();
27521 return NumBits1 > NumBits2;
27524 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
27525 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
27526 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
27529 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
27530 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
27531 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
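// For example, 'movl %esi, %eax' already clears bits 63:32 of %rax, so the
// i32 -> i64 zero extension costs no extra instruction.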
27534 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
27535 EVT VT1 = Val.getValueType();
27536 if (isZExtFree(VT1, VT2))
27539 if (Val.getOpcode() != ISD::LOAD)
27542 if (!VT1.isSimple() || !VT1.isInteger() ||
27543 !VT2.isSimple() || !VT2.isInteger())
27546 switch (VT1.getSimpleVT().SimpleTy) {
27551 // X86 has 8, 16, and 32-bit zero-extending loads.
27558 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
27559 EVT SrcVT = ExtVal.getOperand(0).getValueType();
27561 // There is no extending load for vXi1.
27562 if (SrcVT.getScalarType() == MVT::i1)
27569 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
27570 if (!Subtarget.hasAnyFMA())
27573 VT = VT.getScalarType();
27575 if (!VT.isSimple())
27578 switch (VT.getSimpleVT().SimpleTy) {
27589 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
27590 // i16 instructions are longer (0x66 prefix) and potentially slower.
27591 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
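// For example, 'addw $1, %ax' needs a 0x66 operand-size prefix that
// 'addl $1, %eax' does not, so narrowing i32 operations to i16 is not
// considered a win.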
27594 /// Targets can use this to indicate that they only support *some*
27595 /// VECTOR_SHUFFLE operations, those with specific masks.
27596 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
27597 /// are assumed to be legal.
27598 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
27599 if (!VT.isSimple())
27602 // Not for i1 vectors
27603 if (VT.getSimpleVT().getScalarType() == MVT::i1)
27606 // Very little shuffling can be done for 64-bit vectors right now.
27607 if (VT.getSimpleVT().getSizeInBits() == 64)
27610 // We only care that the types being shuffled are legal. The lowering can
27611 // handle any possible shuffle mask that results.
27612 return isTypeLegal(VT.getSimpleVT());
27615 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
27617 // Don't convert an 'and' into a shuffle that we don't directly support.
27618 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
27619 if (!Subtarget.hasAVX2())
27620 if (VT == MVT::v32i8 || VT == MVT::v16i16)
27623 // Just delegate to the generic legality, clear masks aren't special.
27624 return isShuffleMaskLegal(Mask, VT);
27627 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
27628 // If the subtarget is using retpolines, we must not generate jump tables
// (they lower to indirect branches, which retpolines are meant to avoid).
27629 if (Subtarget.useRetpolineIndirectBranches())
27632 // Otherwise, fallback on the generic logic.
27633 return TargetLowering::areJTsAllowed(Fn);
27636 //===----------------------------------------------------------------------===//
27637 // X86 Scheduler Hooks
27638 //===----------------------------------------------------------------------===//
27640 /// Utility function to emit xbegin specifying the start of an RTM region.
27641 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
27642 const TargetInstrInfo *TII) {
27643 DebugLoc DL = MI.getDebugLoc();
27645 const BasicBlock *BB = MBB->getBasicBlock();
27646 MachineFunction::iterator I = ++MBB->getIterator();
27648 // For the v = xbegin(), we generate
27657 // eax = # XABORT_DEF
27661 // v = phi(s0/mainBB, s1/fallBB)
27663 MachineBasicBlock *thisMBB = MBB;
27664 MachineFunction *MF = MBB->getParent();
27665 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27666 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
27667 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27668 MF->insert(I, mainMBB);
27669 MF->insert(I, fallMBB);
27670 MF->insert(I, sinkMBB);
27672 // Transfer the remainder of BB and its successor edges to sinkMBB.
27673 sinkMBB->splice(sinkMBB->begin(), MBB,
27674 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27675 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27677 MachineRegisterInfo &MRI = MF->getRegInfo();
27678 unsigned DstReg = MI.getOperand(0).getReg();
27679 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27680 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27681 unsigned fallDstReg = MRI.createVirtualRegister(RC);
27685 // # fallthrough to mainMBB
27686 // # abort to fallMBB
27687 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
27688 thisMBB->addSuccessor(mainMBB);
27689 thisMBB->addSuccessor(fallMBB);
27692 // mainDstReg := -1
27693 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
27694 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
27695 mainMBB->addSuccessor(sinkMBB);
27698 // ; pseudo instruction to model hardware's definition from XABORT
27699 // EAX := XABORT_DEF
27700 // fallDstReg := EAX
27701 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
27702 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
27704 fallMBB->addSuccessor(sinkMBB);
27707 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
27708 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
27709 .addReg(mainDstReg).addMBB(mainMBB)
27710 .addReg(fallDstReg).addMBB(fallMBB);
27712 MI.eraseFromParent();
27716 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
27717 const X86Subtarget &Subtarget) {
27718 DebugLoc dl = MI.getDebugLoc();
27719 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27721 // insert input VAL into EAX
27722 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
27723 .addReg(MI.getOperand(0).getReg());
27724 // insert zero to ECX
27725 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
27727 // insert zero to EDX
27728 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
27730 // insert WRPKRU instruction
27731 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
27733 MI.eraseFromParent(); // The pseudo is gone now.
27737 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
27738 const X86Subtarget &Subtarget) {
27739 DebugLoc dl = MI.getDebugLoc();
27740 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27742 // insert zero to ECX
27743 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
27745 // insert RDPKRU instruction
27746 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
27747 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
27750 MI.eraseFromParent(); // The pseudo is gone now.
27754 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
27755 const X86Subtarget &Subtarget,
27757 DebugLoc dl = MI.getDebugLoc();
27758 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27759 // Address into RAX/EAX, other two args into ECX, EDX.
27760 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
27761 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
27762 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
27763 for (int i = 0; i < X86::AddrNumOperands; ++i)
27764 MIB.add(MI.getOperand(i));
27766 unsigned ValOps = X86::AddrNumOperands;
27767 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
27768 .addReg(MI.getOperand(ValOps).getReg());
27769 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
27770 .addReg(MI.getOperand(ValOps + 1).getReg());
27772 // The instruction doesn't actually take any operands though.
27773 BuildMI(*BB, MI, dl, TII->get(Opc));
27775 MI.eraseFromParent(); // The pseudo is gone now.
27779 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
27780 const X86Subtarget &Subtarget) {
27781 DebugLoc dl = MI->getDebugLoc();
27782 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27783 // Address into RAX/EAX
27784 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
27785 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
27786 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
27787 for (int i = 0; i < X86::AddrNumOperands; ++i)
27788 MIB.add(MI->getOperand(i));
27790 // The instruction doesn't actually take any operands though.
27791 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
27793 MI->eraseFromParent(); // The pseudo is gone now.
27799 MachineBasicBlock *
27800 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
27801 MachineBasicBlock *MBB) const {
27802 // Emit va_arg instruction on X86-64.
27804 // Operands to this pseudo-instruction:
27805 // 0 ) Output : destination address (reg)
27806 // 1-5) Input : va_list address (addr, i64mem)
27807 // 6 ) ArgSize : Size (in bytes) of vararg type
27808 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
27809 // 8 ) Align : Alignment of type
27810 // 9 ) EFLAGS (implicit-def)
27812 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
27813 static_assert(X86::AddrNumOperands == 5,
27814 "VAARG_64 assumes 5 address operands");
27816 unsigned DestReg = MI.getOperand(0).getReg();
27817 MachineOperand &Base = MI.getOperand(1);
27818 MachineOperand &Scale = MI.getOperand(2);
27819 MachineOperand &Index = MI.getOperand(3);
27820 MachineOperand &Disp = MI.getOperand(4);
27821 MachineOperand &Segment = MI.getOperand(5);
27822 unsigned ArgSize = MI.getOperand(6).getImm();
27823 unsigned ArgMode = MI.getOperand(7).getImm();
27824 unsigned Align = MI.getOperand(8).getImm();
27826 // Memory Reference
27827 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
27828 SmallVector<MachineMemOperand *, 1> MMOs(MI.memoperands_begin(),
27829 MI.memoperands_end());
27831 // Machine Information
27832 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27833 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
27834 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
27835 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
27836 DebugLoc DL = MI.getDebugLoc();
27838 // struct va_list {
27839 //   i32 gp_offset
27840 //   i32 fp_offset
27841 //   i64 overflow_area (address)
27842 //   i64 reg_save_area (address)
27843 // }
27844 // sizeof(va_list) = 24
27845 // alignment(va_list) = 8
27847 unsigned TotalNumIntRegs = 6;
27848 unsigned TotalNumXMMRegs = 8;
27849 bool UseGPOffset = (ArgMode == 1);
27850 bool UseFPOffset = (ArgMode == 2);
27851 unsigned MaxOffset = TotalNumIntRegs * 8 +
27852 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
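// For example, when pulling from fp_offset this is 6*8 + 8*16 = 176 bytes,
// the full size of the register save area (6 GP registers + 8 XMM registers).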
27854 /* Align ArgSize to a multiple of 8 */
27855 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
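// e.g. ArgSize = 12 rounds up to ArgSizeA8 = 16, while ArgSize = 8 stays 8.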
27856 bool NeedsAlign = (Align > 8);
27858 MachineBasicBlock *thisMBB = MBB;
27859 MachineBasicBlock *overflowMBB;
27860 MachineBasicBlock *offsetMBB;
27861 MachineBasicBlock *endMBB;
27863 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
27864 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
27865 unsigned OffsetReg = 0;
27867 if (!UseGPOffset && !UseFPOffset) {
27868 // If we only pull from the overflow region, we don't create a branch.
27869 // We don't need to alter control flow.
27870 OffsetDestReg = 0; // unused
27871 OverflowDestReg = DestReg;
27873 offsetMBB = nullptr;
27874 overflowMBB = thisMBB;
27877 // First emit code to check if gp_offset (or fp_offset) is below the bound.
27878 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
27879 // If not, pull from overflow_area. (branch to overflowMBB)
27884 // offsetMBB overflowMBB
27889 // Registers for the PHI in endMBB
27890 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
27891 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
27893 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
27894 MachineFunction *MF = MBB->getParent();
27895 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27896 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27897 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27899 MachineFunction::iterator MBBIter = ++MBB->getIterator();
27901 // Insert the new basic blocks
27902 MF->insert(MBBIter, offsetMBB);
27903 MF->insert(MBBIter, overflowMBB);
27904 MF->insert(MBBIter, endMBB);
27906 // Transfer the remainder of MBB and its successor edges to endMBB.
27907 endMBB->splice(endMBB->begin(), thisMBB,
27908 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
27909 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
27911 // Make offsetMBB and overflowMBB successors of thisMBB
27912 thisMBB->addSuccessor(offsetMBB);
27913 thisMBB->addSuccessor(overflowMBB);
27915 // endMBB is a successor of both offsetMBB and overflowMBB
27916 offsetMBB->addSuccessor(endMBB);
27917 overflowMBB->addSuccessor(endMBB);
27919 // Load the offset value into a register
27920 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
27921 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
27925 .addDisp(Disp, UseFPOffset ? 4 : 0)
27929 // Check if there is enough room left to pull this argument.
27930 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
27932 .addImm(MaxOffset + 8 - ArgSizeA8);
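// For example, an 8-byte GP argument has MaxOffset = 48 and ArgSizeA8 = 8, so
// the comparison is against 48: gp_offset values 0..40 still name a saved
// register, while anything at or above 48 is sent to the overflow path.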
27934 // Branch to "overflowMBB" if offset >= max
27935 // Fall through to "offsetMBB" otherwise
27936 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
27937 .addMBB(overflowMBB);
27940 // In offsetMBB, emit code to use the reg_save_area.
27942 assert(OffsetReg != 0);
27944 // Read the reg_save_area address.
27945 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
27946 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
27954 // Zero-extend the offset
27955 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
27956 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
27959 .addImm(X86::sub_32bit);
27961 // Add the offset to the reg_save_area to get the final address.
27962 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
27963 .addReg(OffsetReg64)
27964 .addReg(RegSaveReg);
27966 // Compute the offset for the next argument
27967 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
27968 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
27970 .addImm(UseFPOffset ? 16 : 8);
27972 // Store it back into the va_list.
27973 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
27977 .addDisp(Disp, UseFPOffset ? 4 : 0)
27979 .addReg(NextOffsetReg)
27983 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
27988 // Emit code to use overflow area
27991 // Load the overflow_area address into a register.
27992 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
27993 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
28001 // If we need to align it, do so. Otherwise, just copy the address
28002 // to OverflowDestReg.
28004 // Align the overflow address
28005 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
28006 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
28008 // aligned_addr = (addr + (align-1)) & ~(align-1)
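// For example, with Align = 16 and an overflow address of 0x1003:
// (0x1003 + 15) & ~15 = 0x1010.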
28009 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
28010 .addReg(OverflowAddrReg)
28013 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
28015 .addImm(~(uint64_t)(Align-1));
28017 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
28018 .addReg(OverflowAddrReg);
28021 // Compute the next overflow address after this argument.
28022 // (the overflow address should be kept 8-byte aligned)
28023 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
28024 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
28025 .addReg(OverflowDestReg)
28026 .addImm(ArgSizeA8);
28028 // Store the new overflow address.
28029 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
28035 .addReg(NextAddrReg)
28038 // If we branched, emit the PHI to the front of endMBB.
28040 BuildMI(*endMBB, endMBB->begin(), DL,
28041 TII->get(X86::PHI), DestReg)
28042 .addReg(OffsetDestReg).addMBB(offsetMBB)
28043 .addReg(OverflowDestReg).addMBB(overflowMBB);
28046 // Erase the pseudo instruction
28047 MI.eraseFromParent();
28052 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
28053 MachineInstr &MI, MachineBasicBlock *MBB) const {
28054 // Emit code to save XMM registers to the stack. The ABI says that the
28055 // number of registers to save is given in %al, so it's theoretically
28056 // possible to do an indirect jump trick to avoid saving all of them;
28057 // however, this code takes a simpler approach and just executes all
28058 // of the stores if %al is non-zero. It's less code, and it's probably
28059 // easier on the hardware branch predictor, and stores aren't all that
28060 // expensive anyway.
28062 // Create the new basic blocks. One block contains all the XMM stores,
28063 // and one block is the final destination regardless of whether any
28064 // stores were performed.
28065 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
28066 MachineFunction *F = MBB->getParent();
28067 MachineFunction::iterator MBBIter = ++MBB->getIterator();
28068 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
28069 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
28070 F->insert(MBBIter, XMMSaveMBB);
28071 F->insert(MBBIter, EndMBB);
28073 // Transfer the remainder of MBB and its successor edges to EndMBB.
28074 EndMBB->splice(EndMBB->begin(), MBB,
28075 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
28076 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
28078 // The original block will now fall through to the XMM save block.
28079 MBB->addSuccessor(XMMSaveMBB);
28080 // The XMMSaveMBB will fall through to the end block.
28081 XMMSaveMBB->addSuccessor(EndMBB);
28083 // Now add the instructions.
28084 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28085 DebugLoc DL = MI.getDebugLoc();
28087 unsigned CountReg = MI.getOperand(0).getReg();
28088 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
28089 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
28091 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
28092 // If %al is 0, branch around the XMM save block.
28093 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
28094 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
28095 MBB->addSuccessor(EndMBB);
28098 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
28099 // that was just emitted, but clearly shouldn't be "saved".
28100 assert((MI.getNumOperands() <= 3 ||
28101 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
28102 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
28103 "Expected last argument to be EFLAGS");
28104 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
28105 // In the XMM save block, save all the XMM argument registers.
28106 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
28107 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
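// e.g. the first XMM argument (i == 3) is stored at VarArgsFPOffset, the next
// at VarArgsFPOffset + 16, and so on, one 16-byte slot per register.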
28108 MachineMemOperand *MMO = F->getMachineMemOperand(
28109 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
28110 MachineMemOperand::MOStore,
28111 /*Size=*/16, /*Align=*/16);
28112 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
28113 .addFrameIndex(RegSaveFrameIndex)
28114 .addImm(/*Scale=*/1)
28115 .addReg(/*IndexReg=*/0)
28116 .addImm(/*Disp=*/Offset)
28117 .addReg(/*Segment=*/0)
28118 .addReg(MI.getOperand(i).getReg())
28119 .addMemOperand(MMO);
28122 MI.eraseFromParent(); // The pseudo instruction is gone now.
28127 // The EFLAGS operand of SelectItr might be missing a kill marker
28128 // because there were multiple uses of EFLAGS, and ISel didn't know
28129 // which to mark. Figure out whether SelectItr should have had a
28130 // kill marker, and set it if it should. Returns the correct kill
28132 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
28133 MachineBasicBlock* BB,
28134 const TargetRegisterInfo* TRI) {
28135 // Scan forward through BB for a use/def of EFLAGS.
28136 MachineBasicBlock::iterator miI(std::next(SelectItr));
28137 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
28138 const MachineInstr& mi = *miI;
28139 if (mi.readsRegister(X86::EFLAGS))
28141 if (mi.definesRegister(X86::EFLAGS))
28142 break; // Should have kill-flag - update below.
28145 // If we hit the end of the block, check whether EFLAGS is live into a
28147 if (miI == BB->end()) {
28148 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
28149 sEnd = BB->succ_end();
28150 sItr != sEnd; ++sItr) {
28151 MachineBasicBlock* succ = *sItr;
28152 if (succ->isLiveIn(X86::EFLAGS))
28157 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
28158 // out. SelectMI should have a kill flag on EFLAGS.
28159 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
28163 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
28164 // together with other CMOV pseudo-opcodes into a single basic-block with
28165 // conditional jump around it.
28166 static bool isCMOVPseudo(MachineInstr &MI) {
28167 switch (MI.getOpcode()) {
28168 case X86::CMOV_FR32:
28169 case X86::CMOV_FR64:
28170 case X86::CMOV_GR8:
28171 case X86::CMOV_GR16:
28172 case X86::CMOV_GR32:
28173 case X86::CMOV_RFP32:
28174 case X86::CMOV_RFP64:
28175 case X86::CMOV_RFP80:
28176 case X86::CMOV_VR128:
28177 case X86::CMOV_VR128X:
28178 case X86::CMOV_VR256:
28179 case X86::CMOV_VR256X:
28180 case X86::CMOV_VR512:
28181 case X86::CMOV_VK2:
28182 case X86::CMOV_VK4:
28183 case X86::CMOV_VK8:
28184 case X86::CMOV_VK16:
28185 case X86::CMOV_VK32:
28186 case X86::CMOV_VK64:
28194 // Helper function, which inserts PHI functions into SinkMBB:
28195 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
28196 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
28197 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
28198 // the last PHI function inserted.
28199 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
28200 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
28201 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
28202 MachineBasicBlock *SinkMBB) {
28203 MachineFunction *MF = TrueMBB->getParent();
28204 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
28205 DebugLoc DL = MIItBegin->getDebugLoc();
28207 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
28208 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
28210 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
28212 // As we are creating the PHIs, we have to be careful if there is more than
28213 // one. Later CMOVs may reference the results of earlier CMOVs, but later
28214 // PHIs have to reference the individual true/false inputs from earlier PHIs.
28215 // That also means that PHI construction must work forward from earlier to
28216 // later, and that the code must maintain a mapping from each earlier PHI's
28217 // destination register to the registers that went into that PHI.
28218 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
28219 MachineInstrBuilder MIB;
28221 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
28222 unsigned DestReg = MIIt->getOperand(0).getReg();
28223 unsigned Op1Reg = MIIt->getOperand(1).getReg();
28224 unsigned Op2Reg = MIIt->getOperand(2).getReg();
28226 // If this CMOV we are generating is the opposite condition from
28227 // the jump we generated, then we have to swap the operands for the
28228 // PHI that is going to be generated.
28229 if (MIIt->getOperand(3).getImm() == OppCC)
28230 std::swap(Op1Reg, Op2Reg);
28232 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
28233 Op1Reg = RegRewriteTable[Op1Reg].first;
28235 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
28236 Op2Reg = RegRewriteTable[Op2Reg].second;
28238 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
28244 // Add this PHI to the rewrite table.
28245 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
28251 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
28252 MachineBasicBlock *
28253 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
28254 MachineInstr &SecondCascadedCMOV,
28255 MachineBasicBlock *ThisMBB) const {
28256 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28257 DebugLoc DL = FirstCMOV.getDebugLoc();
28259 // We lower cascaded CMOVs such as
28261 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
28263 // to two successive branches.
28265 // Without this, we would add a PHI between the two jumps, which ends up
28266 // creating a few copies all around. For instance, for
28268 // (sitofp (zext (fcmp une)))
28270 // we would generate:
28272 // ucomiss %xmm1, %xmm0
28273 // movss <1.0f>, %xmm0
28274 // movaps %xmm0, %xmm1
28276 // xorps %xmm1, %xmm1
28279 // movaps %xmm1, %xmm0
28283 // because this custom-inserter would have generated:
28295 // A: X = ...; Y = ...
28297 // C: Z = PHI [X, A], [Y, B]
28299 // E: PHI [X, C], [Z, D]
28301 // If we lower both CMOVs in a single step, we can instead generate:
28313 // A: X = ...; Y = ...
28315 // E: PHI [X, A], [X, C], [Y, D]
28317 // Which, in our sitofp/fcmp example, gives us something like:
28319 // ucomiss %xmm1, %xmm0
28320 // movss <1.0f>, %xmm0
28323 // xorps %xmm0, %xmm0
28328 // We lower cascaded CMOV into two successive branches to the same block.
28329 // EFLAGS is used by both, so mark it as live in the second.
28330 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
28331 MachineFunction *F = ThisMBB->getParent();
28332 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
28333 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
28334 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
28336 MachineFunction::iterator It = ++ThisMBB->getIterator();
28337 F->insert(It, FirstInsertedMBB);
28338 F->insert(It, SecondInsertedMBB);
28339 F->insert(It, SinkMBB);
28341 // For a cascaded CMOV, we lower it to two successive branches to
28342 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
28343 // the FirstInsertedMBB.
28344 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
28346 // If the EFLAGS register isn't dead in the terminator, then claim that it's
28347 // live into the sink and copy blocks.
28348 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28349 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
28350 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
28351 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
28352 SinkMBB->addLiveIn(X86::EFLAGS);
28355 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
28356 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
28357 std::next(MachineBasicBlock::iterator(FirstCMOV)),
28359 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
28361 // Fallthrough block for ThisMBB.
28362 ThisMBB->addSuccessor(FirstInsertedMBB);
28363 // The true block target of the first branch is always SinkMBB.
28364 ThisMBB->addSuccessor(SinkMBB);
28365 // Fallthrough block for FirstInsertedMBB.
28366 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
28367 // The true block for the branch of FirstInsertedMBB.
28368 FirstInsertedMBB->addSuccessor(SinkMBB);
28369 // This is fallthrough.
28370 SecondInsertedMBB->addSuccessor(SinkMBB);
28372 // Create the conditional branch instructions.
28373 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
28374 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
28375 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
28377 X86::CondCode SecondCC =
28378 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
28379 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
28380 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
28383 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
28384 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
28385 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
28386 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
28387 MachineInstrBuilder MIB =
28388 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
28390 .addMBB(SecondInsertedMBB)
28394 // The edge from FirstInsertedMBB provides the same incoming value as the
28395 // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
28396 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
28397 // Copy the PHI result to the register defined by the second CMOV.
28398 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
28399 TII->get(TargetOpcode::COPY),
28400 SecondCascadedCMOV.getOperand(0).getReg())
28401 .addReg(FirstCMOV.getOperand(0).getReg());
28403 // Now remove the CMOVs.
28404 FirstCMOV.eraseFromParent();
28405 SecondCascadedCMOV.eraseFromParent();
28410 MachineBasicBlock *
28411 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
28412 MachineBasicBlock *ThisMBB) const {
28413 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28414 DebugLoc DL = MI.getDebugLoc();
28416 // To "insert" a SELECT_CC instruction, we actually have to insert the
28417 // diamond control-flow pattern. The incoming instruction knows the
28418 // destination vreg to set, the condition code register to branch on, the
28419 // true/false values to select between and a branch opcode to use.
28424 // cmpTY ccX, r1, r2
28426 // fallthrough --> FalseMBB
28428 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
28429 // as described above, by inserting a BB, and then making a PHI at the join
28430 // point to select the true and false operands of the CMOV in the PHI.
28432 // The code also handles two different cases of multiple CMOV opcodes
28436 // In this case, there are multiple CMOVs in a row, all of which are based on
28437 // the same condition setting (or the exact opposite condition setting).
28438 // In this case we can lower all the CMOVs using a single inserted BB, and
28439 // then make a number of PHIs at the join point to model the CMOVs. The only
28440 // trickiness here, is that in a case like:
28442 // t2 = CMOV cond1 t1, f1
28443 // t3 = CMOV cond1 t2, f2
28445 // when rewriting this into PHIs, we have to perform some renaming on the
28446 // temps since you cannot have a PHI operand refer to a PHI result earlier
28447 // in the same block. The "simple" but wrong lowering would be:
28449 // t2 = PHI t1(BB1), f1(BB2)
28450 // t3 = PHI t2(BB1), f2(BB2)
28452 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
28453 // renaming is to note that on the path through BB1, t2 is really just a
28454 // copy of t1, and do that renaming, properly generating:
28456 // t2 = PHI t1(BB1), f1(BB2)
28457 // t3 = PHI t1(BB1), f2(BB2)
28460 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
28461 // function - EmitLoweredCascadedSelect.
28463 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
28464 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
28465 MachineInstr *LastCMOV = &MI;
28466 MachineBasicBlock::iterator NextMIIt =
28467 std::next(MachineBasicBlock::iterator(MI));
28469 // Check for case 1, where there are multiple CMOVs with the same condition
28470 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
28471 // number of jumps the most.
28473 if (isCMOVPseudo(MI)) {
28474 // See if we have a string of CMOVS with the same condition.
28475 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
28476 (NextMIIt->getOperand(3).getImm() == CC ||
28477 NextMIIt->getOperand(3).getImm() == OppCC)) {
28478 LastCMOV = &*NextMIIt;
28483 // This checks for case 2, but we only do it if we didn't already find
28484 // case 1, as indicated by LastCMOV still being &MI.
28485 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
28486 NextMIIt->getOpcode() == MI.getOpcode() &&
28487 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
28488 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
28489 NextMIIt->getOperand(1).isKill()) {
28490 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
28493 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
28494 MachineFunction *F = ThisMBB->getParent();
28495 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
28496 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
28498 MachineFunction::iterator It = ++ThisMBB->getIterator();
28499 F->insert(It, FalseMBB);
28500 F->insert(It, SinkMBB);
28502 // If the EFLAGS register isn't dead in the terminator, then claim that it's
28503 // live into the sink and copy blocks.
28504 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28505 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
28506 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
28507 FalseMBB->addLiveIn(X86::EFLAGS);
28508 SinkMBB->addLiveIn(X86::EFLAGS);
28511 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
28512 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
28513 std::next(MachineBasicBlock::iterator(LastCMOV)),
28515 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
28517 // Fallthrough block for ThisMBB.
28518 ThisMBB->addSuccessor(FalseMBB);
28519 // The true block target of the first (or only) branch is always a SinkMBB.
28520 ThisMBB->addSuccessor(SinkMBB);
28521 // Fallthrough block for FalseMBB.
28522 FalseMBB->addSuccessor(SinkMBB);
28524 // Create the conditional branch instruction.
28525 unsigned Opc = X86::GetCondBranchFromCond(CC);
28526 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
28529 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
28531 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
28532 MachineBasicBlock::iterator MIItEnd =
28533 std::next(MachineBasicBlock::iterator(LastCMOV));
28534 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
28536 // Now remove the CMOV(s).
28537 ThisMBB->erase(MIItBegin, MIItEnd);
28542 MachineBasicBlock *
28543 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
28544 MachineBasicBlock *BB) const {
28545 // Combine the following atomic floating-point modification pattern:
28546 // a.store(reg OP a.load(acquire), release)
28547 // Transform them into:
28548 // OPss (%gpr), %xmm
28549 // movss %xmm, (%gpr)
28550 // Or sd equivalent for 64-bit operations.
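// For a float add, for instance, the emitted sequence looks like
//   addss (%rdi), %xmm0
//   movss %xmm0, (%rdi)
// (the register names here are purely illustrative).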
28552 switch (MI.getOpcode()) {
28553 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
28554 case X86::RELEASE_FADD32mr:
28555 FOp = X86::ADDSSrm;
28556 MOp = X86::MOVSSmr;
28558 case X86::RELEASE_FADD64mr:
28559 FOp = X86::ADDSDrm;
28560 MOp = X86::MOVSDmr;
28563 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28564 DebugLoc DL = MI.getDebugLoc();
28565 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
28566 unsigned ValOpIdx = X86::AddrNumOperands;
28567 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
28568 MachineInstrBuilder MIB =
28569 BuildMI(*BB, MI, DL, TII->get(FOp),
28570 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
28572 for (int i = 0; i < X86::AddrNumOperands; ++i) {
28573 MachineOperand &Operand = MI.getOperand(i);
28574 // Clear any kill flags on register operands as we'll create a second
28575 // instruction using the same address operands.
28576 if (Operand.isReg())
28577 Operand.setIsKill(false);
28580 MachineInstr *FOpMI = MIB;
28581 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
28582 for (int i = 0; i < X86::AddrNumOperands; ++i)
28583 MIB.add(MI.getOperand(i));
28584 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
28585 MI.eraseFromParent(); // The pseudo instruction is gone now.
28589 MachineBasicBlock *
28590 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
28591 MachineBasicBlock *BB) const {
28592 MachineFunction *MF = BB->getParent();
28593 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28594 DebugLoc DL = MI.getDebugLoc();
28595 const BasicBlock *LLVM_BB = BB->getBasicBlock();
28597 assert(MF->shouldSplitStack());
28599 const bool Is64Bit = Subtarget.is64Bit();
28600 const bool IsLP64 = Subtarget.isTarget64BitLP64();
28602 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
28603 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
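// These are the offsets of the stack-limit slot in the thread control block
// (addressed through %fs / %gs) used by the segmented-stacks / -fsplit-stack
// ABI; the exact values are ABI convention, not something this code chooses.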
28606 // ... [Till the alloca]
28607 // If stacklet is not large enough, jump to mallocMBB
28610 // Allocate by subtracting from RSP
28611 // Jump to continueMBB
28614 // Allocate by call to runtime
28618 // [rest of original BB]
28621 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
28622 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
28623 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
28625 MachineRegisterInfo &MRI = MF->getRegInfo();
28626 const TargetRegisterClass *AddrRegClass =
28627 getRegClassFor(getPointerTy(MF->getDataLayout()));
28629 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
28630 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
28631 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
28632 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
28633 sizeVReg = MI.getOperand(1).getReg(),
28635 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
28637 MachineFunction::iterator MBBIter = ++BB->getIterator();
28639 MF->insert(MBBIter, bumpMBB);
28640 MF->insert(MBBIter, mallocMBB);
28641 MF->insert(MBBIter, continueMBB);
28643 continueMBB->splice(continueMBB->begin(), BB,
28644 std::next(MachineBasicBlock::iterator(MI)), BB->end());
28645 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
28647 // Add code to the main basic block to check if the stack limit has been hit,
28648 // and if so, jump to mallocMBB otherwise to bumpMBB.
28649 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
28650 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
28651 .addReg(tmpSPVReg).addReg(sizeVReg);
28652 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
28653 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
28654 .addReg(SPLimitVReg);
28655 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
28657 // bumpMBB simply decreases the stack pointer, since we know the current
28658 // stacklet has enough space.
28659 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
28660 .addReg(SPLimitVReg);
28661 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
28662 .addReg(SPLimitVReg);
28663 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
28665 // Calls into a routine in libgcc to allocate more space from the heap.
28666 const uint32_t *RegMask =
28667 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
28669 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
28671 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
28672 .addExternalSymbol("__morestack_allocate_stack_space")
28673 .addRegMask(RegMask)
28674 .addReg(X86::RDI, RegState::Implicit)
28675 .addReg(X86::RAX, RegState::ImplicitDefine);
28676 } else if (Is64Bit) {
28677 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
28679 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
28680 .addExternalSymbol("__morestack_allocate_stack_space")
28681 .addRegMask(RegMask)
28682 .addReg(X86::EDI, RegState::Implicit)
28683 .addReg(X86::EAX, RegState::ImplicitDefine);
28685 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
28687 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
28688 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
28689 .addExternalSymbol("__morestack_allocate_stack_space")
28690 .addRegMask(RegMask)
28691 .addReg(X86::EAX, RegState::ImplicitDefine);
28695 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
28698 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
28699 .addReg(IsLP64 ? X86::RAX : X86::EAX);
28700 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
28702 // Set up the CFG correctly.
28703 BB->addSuccessor(bumpMBB);
28704 BB->addSuccessor(mallocMBB);
28705 mallocMBB->addSuccessor(continueMBB);
28706 bumpMBB->addSuccessor(continueMBB);
28708 // Take care of the PHI nodes.
28709 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
28710 MI.getOperand(0).getReg())
28711 .addReg(mallocPtrVReg)
28713 .addReg(bumpSPPtrVReg)
28716 // Delete the original pseudo instruction.
28717 MI.eraseFromParent();
28720 return continueMBB;
28723 MachineBasicBlock *
28724 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
28725 MachineBasicBlock *BB) const {
28726 MachineFunction *MF = BB->getParent();
28727 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
28728 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
28729 DebugLoc DL = MI.getDebugLoc();
28731 assert(!isAsynchronousEHPersonality(
28732 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
28733 "SEH does not use catchret!");
28735 // Only 32-bit EH needs to worry about manually restoring stack pointers.
28736 if (!Subtarget.is32Bit())
28739 // C++ EH creates a new target block to hold the restore code, and wires up
28740 // the new block to the return destination with a normal JMP_4.
28741 MachineBasicBlock *RestoreMBB =
28742 MF->CreateMachineBasicBlock(BB->getBasicBlock());
28743 assert(BB->succ_size() == 1);
28744 MF->insert(std::next(BB->getIterator()), RestoreMBB);
28745 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
28746 BB->addSuccessor(RestoreMBB);
28747 MI.getOperand(0).setMBB(RestoreMBB);
28749 auto RestoreMBBI = RestoreMBB->begin();
28750 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
28751 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
28755 MachineBasicBlock *
28756 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
28757 MachineBasicBlock *BB) const {
28758 MachineFunction *MF = BB->getParent();
28759 const Constant *PerFn = MF->getFunction().getPersonalityFn();
28760 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
28761 // Only 32-bit SEH requires special handling for catchpad.
28762 if (IsSEH && Subtarget.is32Bit()) {
28763 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
28764 DebugLoc DL = MI.getDebugLoc();
28765 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
28767 MI.eraseFromParent();
28771 MachineBasicBlock *
28772 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
28773 MachineBasicBlock *BB) const {
28774 // So, here we replace TLSADDR with the sequence:
28775 // adjust_stackdown -> TLSADDR -> adjust_stackup.
28776 // We need this because TLSADDR is lowered into calls
28777 // inside MC; without the two markers, shrink-wrapping
28778 // may push the prologue/epilogue past them.
28779 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
28780 DebugLoc DL = MI.getDebugLoc();
28781 MachineFunction &MF = *BB->getParent();
28783 // Emit CALLSEQ_START right before the instruction.
28784 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
28785 MachineInstrBuilder CallseqStart =
28786 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
28787 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
28789 // Emit CALLSEQ_END right after the instruction.
28790 // We don't call erase from parent because we want to keep the
28791 // original instruction around.
28792 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
28793 MachineInstrBuilder CallseqEnd =
28794 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
28795 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
28800 MachineBasicBlock *
28801 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
28802 MachineBasicBlock *BB) const {
28803 // This is pretty easy. We take the value we loaded from the
28804 // relocation, stick it in either RDI (x86-64) or EAX, and do an
28805 // indirect call. The return value will then be in the normal
28806 // return register.
28807 MachineFunction *F = BB->getParent();
28808 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28809 DebugLoc DL = MI.getDebugLoc();
28811 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
28812 assert(MI.getOperand(3).isGlobal() && "This should be a global");
28814 // Get a register mask for the lowered call.
28815 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
28816 // proper register mask.
28817 const uint32_t *RegMask =
28818 Subtarget.is64Bit() ?
28819 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
28820 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
28821 if (Subtarget.is64Bit()) {
28822 MachineInstrBuilder MIB =
28823 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
28827 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
28828 MI.getOperand(3).getTargetFlags())
28830 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
28831 addDirectMem(MIB, X86::RDI);
28832 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
28833 } else if (!isPositionIndependent()) {
28834 MachineInstrBuilder MIB =
28835 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
28839 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
28840 MI.getOperand(3).getTargetFlags())
28842 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
28843 addDirectMem(MIB, X86::EAX);
28844 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
28846 MachineInstrBuilder MIB =
28847 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
28848 .addReg(TII->getGlobalBaseReg(F))
28851 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
28852 MI.getOperand(3).getTargetFlags())
28854 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
28855 addDirectMem(MIB, X86::EAX);
28856 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
28859 MI.eraseFromParent(); // The pseudo instruction is gone now.
28863 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
28865 case X86::RETPOLINE_CALL32:
28866 return X86::CALLpcrel32;
28867 case X86::RETPOLINE_CALL64:
28868 return X86::CALL64pcrel32;
28869 case X86::RETPOLINE_TCRETURN32:
28870 return X86::TCRETURNdi;
28871 case X86::RETPOLINE_TCRETURN64:
28872 return X86::TCRETURNdi64;
28874 llvm_unreachable("not retpoline opcode");
28877 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
28879 if (Subtarget.useRetpolineExternalThunk()) {
28880 // When using an external thunk for retpolines, we pick names that match the
28881 // names GCC happens to use as well. This helps simplify the implementation
28882 // of the thunks for kernels where they have no easy ability to create
28883 // aliases and are doing non-trivial configuration of the thunk's body. For
28884 // example, the Linux kernel will do boot-time hot patching of the thunk
28885 // bodies and cannot easily export aliases of these to loaded modules.
28887 // Note that at any point in the future, we may need to change the semantics
28888 // of how we implement retpolines and at that time will likely change the
28889 // name of the called thunk. Essentially, there is no hard guarantee that
28890 // LLVM will generate calls to specific thunks; we merely make a best-effort
28891 // attempt to help out kernels and other systems where duplicating the
28892 // thunks is costly.
28895 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
28896 return "__x86_indirect_thunk_eax";
28898 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
28899 return "__x86_indirect_thunk_ecx";
28901 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
28902 return "__x86_indirect_thunk_edx";
28904 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
28905 return "__x86_indirect_thunk_edi";
28907 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
28908 return "__x86_indirect_thunk_r11";
28910 llvm_unreachable("unexpected reg for retpoline");
28913 // When targeting an internal COMDAT thunk use an LLVM-specific name.
28916 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
28917 return "__llvm_retpoline_eax";
28919 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
28920 return "__llvm_retpoline_ecx";
28922 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
28923 return "__llvm_retpoline_edx";
28925 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
28926 return "__llvm_retpoline_edi";
28928 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
28929 return "__llvm_retpoline_r11";
28931 llvm_unreachable("unexpected reg for retpoline");
28934 MachineBasicBlock *
28935 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
28936 MachineBasicBlock *BB) const {
28937 // Copy the virtual register into the R11 physical register and
28938 // call the retpoline thunk.
28939 DebugLoc DL = MI.getDebugLoc();
28940 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28941 unsigned CalleeVReg = MI.getOperand(0).getReg();
28942 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
28944 // Find an available scratch register to hold the callee. On 64-bit, we can
28945 // just use R11, but we scan for uses anyway to ensure we don't generate
28946 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
28947 // already a register use operand to the call to hold the callee. If none
28948 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
28949 // register and ESI is the base pointer to realigned stack frames with VLAs.
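//
// The rewritten call then looks roughly like this (64-bit case):
//   $r11 = COPY %callee
//   CALL64pcrel32 @__llvm_retpoline_r11, implicit killed $r11
// i.e. the indirect call becomes a direct call to the thunk, and the thunk
// performs the real indirect branch through the scratch register.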
28950 SmallVector<unsigned, 3> AvailableRegs;
28951 if (Subtarget.is64Bit())
28952 AvailableRegs.push_back(X86::R11);
28954 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
28956 // Zero out any registers that are already used.
28957 for (const auto &MO : MI.operands()) {
28958 if (MO.isReg() && MO.isUse())
28959 for (unsigned &Reg : AvailableRegs)
28960 if (Reg == MO.getReg())
28964 // Choose the first remaining non-zero available register.
28965 unsigned AvailableReg = 0;
28966 for (unsigned MaybeReg : AvailableRegs) {
28968 AvailableReg = MaybeReg;
28973 report_fatal_error("calling convention incompatible with retpoline, no "
28974 "available registers");
28976 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
28978 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
28979 .addReg(CalleeVReg);
28980 MI.getOperand(0).ChangeToES(Symbol);
28981 MI.setDesc(TII->get(Opc));
28982 MachineInstrBuilder(*BB->getParent(), &MI)
28983 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
28987 /// SetJmp implies future control flow change upon calling the corresponding function.
28989 /// Instead of using the 'return' instruction, the long jump fixes the stack and
28990 /// performs an indirect branch. To do so it uses the registers that were stored
28991 /// in the jump buffer (when calling SetJmp).
28992 /// In case the shadow stack is enabled we need to fix it as well, because some
28993 /// return addresses will be skipped.
28994 /// The function will save the SSP for future fixing in the function
28995 /// emitLongJmpShadowStackFix.
28996 /// \sa emitLongJmpShadowStackFix
28997 /// \param [in] MI The temporary Machine Instruction for the builtin.
28998 /// \param [in] MBB The Machine Basic Block that will be modified.
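///
/// Roughly, the helper emits (64-bit case):
///   xorq   %z, %z
///   rdsspq %z            ; current SSP; stays zero if CET is not enabled
///   movq   %z, 24(buf)   ; saved in pointer-sized slot 3 of the jump buffer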
28999 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
29000 MachineBasicBlock *MBB) const {
29001 DebugLoc DL = MI.getDebugLoc();
29002 MachineFunction *MF = MBB->getParent();
29003 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29004 MachineRegisterInfo &MRI = MF->getRegInfo();
29005 MachineInstrBuilder MIB;
29007 // Memory Reference.
29008 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
29009 MI.memoperands_end());
29011 // Initialize a register with zero.
29012 MVT PVT = getPointerTy(MF->getDataLayout());
29013 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
29014 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
29015 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
29016 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
29018 .addReg(ZReg, RegState::Undef)
29019 .addReg(ZReg, RegState::Undef);
29021 // Read the current SSP Register value to the zeroed register.
29022 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
29023 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
29024 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
29026 // Write the SSP register value to pointer-sized slot 3 of the input memory buffer.
29027 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
29028 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
29029 const int64_t SSPOffset = 3 * PVT.getStoreSize();
29030 const unsigned MemOpndSlot = 1;
29031 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
29032 if (i == X86::AddrDisp)
29033 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
29035 MIB.add(MI.getOperand(MemOpndSlot + i));
29037 MIB.addReg(SSPCopyReg);
29038 MIB.setMemRefs(MMOs);
29041 MachineBasicBlock *
29042 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
29043 MachineBasicBlock *MBB) const {
29044 DebugLoc DL = MI.getDebugLoc();
29045 MachineFunction *MF = MBB->getParent();
29046 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29047 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29048 MachineRegisterInfo &MRI = MF->getRegInfo();
29050 const BasicBlock *BB = MBB->getBasicBlock();
29051 MachineFunction::iterator I = ++MBB->getIterator();
29053 // Memory Reference
29054 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
29055 MI.memoperands_end());
29058 unsigned MemOpndSlot = 0;
29060 unsigned CurOp = 0;
29062 DstReg = MI.getOperand(CurOp++).getReg();
29063 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
29064 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
29066 unsigned mainDstReg = MRI.createVirtualRegister(RC);
29067 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
29069 MemOpndSlot = CurOp;
29071 MVT PVT = getPointerTy(MF->getDataLayout());
29072 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
29073 "Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
//  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
//  SjLjSetup restoreMBB
//
// mainMBB:
//  v_main = 0
//
// sinkMBB:
//  v = phi(main, restore)
//
// restoreMBB:
//  if base pointer being used, load it from frame
//  v_restore = 1
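//
// Pointer-sized slots of buf as used by this lowering and consumed again by
// emitEHSjLjLongJmp (the SSP slot is only written when the shadow stack is
// enabled):
//   buf[0] = frame pointer
//   buf[1] = restore address (LabelOffset)
//   buf[2] = stack pointer   (SPOffset)
//   buf[3] = shadow stack pointer (SSPOffset)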
29091 MachineBasicBlock *thisMBB = MBB;
29092 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
29093 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
29094 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
29095 MF->insert(I, mainMBB);
29096 MF->insert(I, sinkMBB);
29097 MF->push_back(restoreMBB);
29098 restoreMBB->setHasAddressTaken();
29100 MachineInstrBuilder MIB;
29102 // Transfer the remainder of BB and its successor edges to sinkMBB.
29103 sinkMBB->splice(sinkMBB->begin(), MBB,
29104 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
29105 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
29108 unsigned PtrStoreOpc = 0;
29109 unsigned LabelReg = 0;
29110 const int64_t LabelOffset = 1 * PVT.getStoreSize();
29111 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
29112 !isPositionIndependent();
29114 // Prepare IP either in reg or imm.
29115 if (!UseImmLabel) {
29116 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
29117 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
29118 LabelReg = MRI.createVirtualRegister(PtrRC);
29119 if (Subtarget.is64Bit()) {
29120 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
29124 .addMBB(restoreMBB)
29127 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
29128 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
29129 .addReg(XII->getGlobalBaseReg(MF))
29132 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
29136 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
29138 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
29139 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
29140 if (i == X86::AddrDisp)
29141 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
29143 MIB.add(MI.getOperand(MemOpndSlot + i));
29146 MIB.addReg(LabelReg);
29148 MIB.addMBB(restoreMBB);
29149 MIB.setMemRefs(MMOs);
29151 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
29152 emitSetJmpShadowStackFix(MI, thisMBB);
29156 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
29157 .addMBB(restoreMBB);
29159 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29160 MIB.addRegMask(RegInfo->getNoPreservedMask());
29161 thisMBB->addSuccessor(mainMBB);
29162 thisMBB->addSuccessor(restoreMBB);
29166 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
29167 mainMBB->addSuccessor(sinkMBB);
29170 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
29171 TII->get(X86::PHI), DstReg)
29172 .addReg(mainDstReg).addMBB(mainMBB)
29173 .addReg(restoreDstReg).addMBB(restoreMBB);
29176 if (RegInfo->hasBasePointer(*MF)) {
29177 const bool Uses64BitFramePtr =
29178 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
29179 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
29180 X86FI->setRestoreBasePointer(MF);
29181 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
29182 unsigned BasePtr = RegInfo->getBaseRegister();
29183 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
29184 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
29185 FramePtr, true, X86FI->getRestoreBasePointerOffset())
29186 .setMIFlag(MachineInstr::FrameSetup);
29188 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
29189 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
29190 restoreMBB->addSuccessor(sinkMBB);
29192 MI.eraseFromParent();
29196 /// Fix the shadow stack using the previously saved SSP pointer.
29197 /// \sa emitSetJmpShadowStackFix
29198 /// \param [in] MI The temporary Machine Instruction for the builtin.
29199 /// \param [in] MBB The Machine Basic Block that will be modified.
29200 /// \return The sink MBB that will perform the future indirect branch.
29201 MachineBasicBlock *
29202 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
29203 MachineBasicBlock *MBB) const {
29204 DebugLoc DL = MI.getDebugLoc();
29205 MachineFunction *MF = MBB->getParent();
29206 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29207 MachineRegisterInfo &MRI = MF->getRegInfo();
29209 // Memory Reference
29210 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
29211 MI.memoperands_end());
29213 MVT PVT = getPointerTy(MF->getDataLayout());
29214 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
// checkSspMBB:
//         xor vreg1, vreg1
//         rdssp vreg1
//         test vreg1, vreg1
//         je sinkMBB   # Jump if Shadow Stack is not supported
// fallMBB:
//         mov buf+24/12(%rip), vreg2
//         sub vreg1, vreg2
//         jbe sinkMBB  # No need to fix the Shadow Stack
// fixShadowMBB:
//         shr 3/2, vreg2
//         incssp vreg2 # fix the SSP according to the lower 8 bits
//         shr 8, vreg2
//         je sinkMBB
// fixShadowLoopPrepareMBB:
//         shl vreg2
//         mov 128, vreg3
// fixShadowLoopMBB:
//         incssp vreg3
//         dec vreg2
//         jne fixShadowLoopMBB # Iterate until you finish fixing
//                              # the Shadow Stack
// sinkMBB:
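//
// Worked example (64-bit, illustrative): if the saved SSP exceeds the current
// one by 0x1440 bytes, vreg2 = 0x1440 >> 3 = 0x288 slots. The single incssp
// consumes the low 8 bits (0x88 slots), and the remaining 0x200 slots are
// handled by the loop in (0x2 << 1) = 4 iterations of "incssp 128".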
29240 MachineFunction::iterator I = ++MBB->getIterator();
29241 const BasicBlock *BB = MBB->getBasicBlock();
29243 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
29244 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
29245 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
29246 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
29247 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
29248 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
29249 MF->insert(I, checkSspMBB);
29250 MF->insert(I, fallMBB);
29251 MF->insert(I, fixShadowMBB);
29252 MF->insert(I, fixShadowLoopPrepareMBB);
29253 MF->insert(I, fixShadowLoopMBB);
29254 MF->insert(I, sinkMBB);
29256 // Transfer the remainder of BB and its successor edges to sinkMBB.
29257 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
29259 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
29261 MBB->addSuccessor(checkSspMBB);
29263 // Initialize a register with zero.
29264 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
29265 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
29266 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
29268 .addReg(ZReg, RegState::Undef)
29269 .addReg(ZReg, RegState::Undef);
29271 // Read the current SSP Register value to the zeroed register.
29272 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
29273 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
29274 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
29276 // Check whether the SSP value just read is zero and, if so, jump directly to the sink (the shadow stack is not supported).
29278 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
29279 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
29280 .addReg(SSPCopyReg)
29281 .addReg(SSPCopyReg);
29282 BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
29283 checkSspMBB->addSuccessor(sinkMBB);
29284 checkSspMBB->addSuccessor(fallMBB);
29286 // Reload the previously saved SSP register value.
29287 unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
29288 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
29289 const int64_t SSPOffset = 3 * PVT.getStoreSize();
29290 MachineInstrBuilder MIB =
29291 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
29292 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
29293 const MachineOperand &MO = MI.getOperand(i);
29294 if (i == X86::AddrDisp)
29295 MIB.addDisp(MO, SSPOffset);
29296 else if (MO.isReg()) // Don't add the whole operand, we don't want to
29297 // preserve kill flags.
29298 MIB.addReg(MO.getReg());
29302 MIB.setMemRefs(MMOs);
29304 // Subtract the current SSP from the previous SSP.
29305 unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
29306 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
29307 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
29308 .addReg(PrevSSPReg)
29309 .addReg(SSPCopyReg);
29311 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
29312 BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
29313 fallMBB->addSuccessor(sinkMBB);
29314 fallMBB->addSuccessor(fixShadowMBB);
29316 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
29317 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
29318 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
29319 unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
29320 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
29324 // Increase the SSP using only the lower 8 bits of the delta.
29325 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
29326 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
29328 // Reset the lower 8 bits.
29329 unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
29330 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
29331 .addReg(SspFirstShrReg)
29334 // Jump if the result of the shift is zero.
29335 BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
29336 fixShadowMBB->addSuccessor(sinkMBB);
29337 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
29339 // Do a single shift left.
29340 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
29341 unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
29342 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
29343 .addReg(SspSecondShrReg);
29345 // Save the value 128 to a register (will be used next with incssp).
29346 unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
29347 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
29348 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
29350 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
29352 // Since incssp only looks at the lower 8 bits, we might need to do several
29353 // iterations of incssp until we finish fixing the shadow stack.
29354 unsigned DecReg = MRI.createVirtualRegister(PtrRC);
29355 unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
29356 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
29357 .addReg(SspAfterShlReg)
29358 .addMBB(fixShadowLoopPrepareMBB)
29360 .addMBB(fixShadowLoopMBB);
29362 // Every iteration we increase the SSP by 128.
29363 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
29365 // Every iteration we decrement the counter by 1.
29366 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
29367 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
29369 // Jump if the counter is not zero yet.
29370 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
29371 fixShadowLoopMBB->addSuccessor(sinkMBB);
29372 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
29377 MachineBasicBlock *
29378 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
29379 MachineBasicBlock *MBB) const {
29380 DebugLoc DL = MI.getDebugLoc();
29381 MachineFunction *MF = MBB->getParent();
29382 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29383 MachineRegisterInfo &MRI = MF->getRegInfo();
29385 // Memory Reference
29386 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
29387 MI.memoperands_end());
29389 MVT PVT = getPointerTy(MF->getDataLayout());
29390 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
29391 "Invalid Pointer Size!");
29393 const TargetRegisterClass *RC =
29394 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
29395 unsigned Tmp = MRI.createVirtualRegister(RC);
29396 // Since FP is only updated here but NOT referenced, it's treated as GPR.
29397 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29398 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
29399 unsigned SP = RegInfo->getStackRegister();
29401 MachineInstrBuilder MIB;
29403 const int64_t LabelOffset = 1 * PVT.getStoreSize();
29404 const int64_t SPOffset = 2 * PVT.getStoreSize();
29406 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
29407 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
29409 MachineBasicBlock *thisMBB = MBB;
29411 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
29412 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
29413 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
29417 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
29418 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
29419 const MachineOperand &MO = MI.getOperand(i);
29420 if (MO.isReg()) // Don't add the whole operand, we don't want to
29421 // preserve kill flags.
29422 MIB.addReg(MO.getReg());
29426 MIB.setMemRefs(MMOs);
29429 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
29430 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
29431 const MachineOperand &MO = MI.getOperand(i);
29432 if (i == X86::AddrDisp)
29433 MIB.addDisp(MO, LabelOffset);
29434 else if (MO.isReg()) // Don't add the whole operand, we don't want to
29435 // preserve kill flags.
29436 MIB.addReg(MO.getReg());
29440 MIB.setMemRefs(MMOs);
29443 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
29444 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
29445 if (i == X86::AddrDisp)
29446 MIB.addDisp(MI.getOperand(i), SPOffset);
29448 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
29449 // the last instruction of the expansion.
29451 MIB.setMemRefs(MMOs);
29454 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
29456 MI.eraseFromParent();
29460 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
29461 MachineBasicBlock *MBB,
29462 MachineBasicBlock *DispatchBB,
29464 DebugLoc DL = MI.getDebugLoc();
29465 MachineFunction *MF = MBB->getParent();
29466 MachineRegisterInfo *MRI = &MF->getRegInfo();
29467 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29469 MVT PVT = getPointerTy(MF->getDataLayout());
29470 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
29475 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
29476 !isPositionIndependent();
29479 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
29481 const TargetRegisterClass *TRC =
29482 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
29483 VR = MRI->createVirtualRegister(TRC);
29484 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
29486 if (Subtarget.is64Bit())
29487 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
29491 .addMBB(DispatchBB)
29494 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
29495 .addReg(0) /* TII->getGlobalBaseReg(MF) */
29498 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
29502 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
29503 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
29505 MIB.addMBB(DispatchBB);
29510 MachineBasicBlock *
29511 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
29512 MachineBasicBlock *BB) const {
29513 DebugLoc DL = MI.getDebugLoc();
29514 MachineFunction *MF = BB->getParent();
29515 MachineFrameInfo &MFI = MF->getFrameInfo();
29516 MachineRegisterInfo *MRI = &MF->getRegInfo();
29517 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29518 int FI = MFI.getFunctionContextIndex();
29520 // Get a mapping of the call site numbers to all of the landing pads they're
29521 // associated with.
29522 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
29523 unsigned MaxCSNum = 0;
29524 for (auto &MBB : *MF) {
29525 if (!MBB.isEHPad())
29528 MCSymbol *Sym = nullptr;
29529 for (const auto &MI : MBB) {
29530 if (MI.isDebugInstr())
29533 assert(MI.isEHLabel() && "expected EH_LABEL");
29534 Sym = MI.getOperand(0).getMCSymbol();
29538 if (!MF->hasCallSiteLandingPad(Sym))
29541 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
29542 CallSiteNumToLPad[CSI].push_back(&MBB);
29543 MaxCSNum = std::max(MaxCSNum, CSI);
29547 // Get an ordered list of the machine basic blocks for the jump table.
29548 std::vector<MachineBasicBlock *> LPadList;
29549 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
29550 LPadList.reserve(CallSiteNumToLPad.size());
29552 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
29553 for (auto &LP : CallSiteNumToLPad[CSI]) {
29554 LPadList.push_back(LP);
29555 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
29559 assert(!LPadList.empty() &&
29560 "No landing pad destinations for the dispatch jump table!");
29562 // Create the MBBs for the dispatch code.
29564 // Shove the dispatch's address into the return slot in the function context.
29565 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
29566 DispatchBB->setIsEHPad(true);
29568 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
29569 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
29570 DispatchBB->addSuccessor(TrapBB);
29572 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
29573 DispatchBB->addSuccessor(DispContBB);
29576 MF->push_back(DispatchBB);
29577 MF->push_back(DispContBB);
29578 MF->push_back(TrapBB);
29580 // Insert code into the entry block that creates and registers the function context.
29582 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
29584 // Create the jump table and associated information
29585 unsigned JTE = getJumpTableEncoding();
29586 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
29587 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
29589 const X86RegisterInfo &RI = TII->getRegisterInfo();
29590 // Add a register mask with no preserved registers. This results in all
29591 // registers being marked as clobbered.
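// Nothing can be assumed to survive into the dispatch block: it is reached
// through the unwinder from any invoke in the function, so every register is
// treated as clobbered here.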
29592 if (RI.hasBasePointer(*MF)) {
29593 const bool FPIs64Bit =
29594 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
29595 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
29596 MFI->setRestoreBasePointer(MF);
29598 unsigned FP = RI.getFrameRegister(*MF);
29599 unsigned BP = RI.getBaseRegister();
29600 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
29601 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
29602 MFI->getRestoreBasePointerOffset())
29603 .addRegMask(RI.getNoPreservedMask());
29605 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
29606 .addRegMask(RI.getNoPreservedMask());
29609 // IReg is used as an index in a memory operand and therefore can't be SP
29610 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
29611 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
29612 Subtarget.is64Bit() ? 8 : 4);
29613 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
29615 .addImm(LPadList.size());
29616 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
29618 if (Subtarget.is64Bit()) {
29619 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
29620 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
29622 // leaq .LJTI0_0(%rip), BReg
29623 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
29627 .addJumpTableIndex(MJTI)
29629 // movzx IReg64, IReg
29630 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
29633 .addImm(X86::sub_32bit);
29636 case MachineJumpTableInfo::EK_BlockAddress:
29637 // jmpq *(BReg,IReg64,8)
29638 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
29645 case MachineJumpTableInfo::EK_LabelDifference32: {
29646 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
29647 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
29648 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
29650 // movl (BReg,IReg64,4), OReg
29651 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
29657 // movsx OReg64, OReg
29658 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
29659 // addq BReg, OReg64, TReg
29660 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
29664 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
29668 llvm_unreachable("Unexpected jump table encoding");
29671 // jmpl *.LJTI0_0(,IReg,4)
29672 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
29676 .addJumpTableIndex(MJTI)
29680 // Add the jump table entries as successors to the MBB.
29681 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
29682 for (auto &LP : LPadList)
29683 if (SeenMBBs.insert(LP).second)
29684 DispContBB->addSuccessor(LP);
29686 // N.B. the order the invoke BBs are processed in doesn't matter here.
29687 SmallVector<MachineBasicBlock *, 64> MBBLPads;
29688 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
29689 for (MachineBasicBlock *MBB : InvokeBBs) {
29690 // Remove the landing pad successor from the invoke block and replace it
29691 // with the new dispatch block.
29692 // Keep a copy of Successors since it's modified inside the loop.
29693 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
29695 // FIXME: Avoid quadratic complexity.
29696 for (auto MBBS : Successors) {
29697 if (MBBS->isEHPad()) {
29698 MBB->removeSuccessor(MBBS);
29699 MBBLPads.push_back(MBBS);
29703 MBB->addSuccessor(DispatchBB);
29705 // Find the invoke call and mark all of the callee-saved registers as
29706 // 'implicit defined' so that they're spilled. This prevents code from
29707 // moving instructions to before the EH block, where they will never be executed.
29709 for (auto &II : reverse(*MBB)) {
29713 DenseMap<unsigned, bool> DefRegs;
29714 for (auto &MOp : II.operands())
29716 DefRegs[MOp.getReg()] = true;
29718 MachineInstrBuilder MIB(*MF, &II);
29719 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
29720 unsigned Reg = SavedRegs[RI];
29722 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
29729 // Mark all former landing pads as non-landing pads. The dispatch is the only
29730 // landing pad now.
29731 for (auto &LP : MBBLPads)
29732 LP->setIsEHPad(false);
29734 // The instruction is gone now.
29735 MI.eraseFromParent();
29739 MachineBasicBlock *
29740 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
29741 MachineBasicBlock *BB) const {
29742 MachineFunction *MF = BB->getParent();
29743 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29744 DebugLoc DL = MI.getDebugLoc();
29746 switch (MI.getOpcode()) {
29747 default: llvm_unreachable("Unexpected instr type to insert");
29748 case X86::TLS_addr32:
29749 case X86::TLS_addr64:
29750 case X86::TLS_base_addr32:
29751 case X86::TLS_base_addr64:
29752 return EmitLoweredTLSAddr(MI, BB);
29753 case X86::RETPOLINE_CALL32:
29754 case X86::RETPOLINE_CALL64:
29755 case X86::RETPOLINE_TCRETURN32:
29756 case X86::RETPOLINE_TCRETURN64:
29757 return EmitLoweredRetpoline(MI, BB);
29758 case X86::CATCHRET:
29759 return EmitLoweredCatchRet(MI, BB);
29760 case X86::CATCHPAD:
29761 return EmitLoweredCatchPad(MI, BB);
29762 case X86::SEG_ALLOCA_32:
29763 case X86::SEG_ALLOCA_64:
29764 return EmitLoweredSegAlloca(MI, BB);
29765 case X86::TLSCall_32:
29766 case X86::TLSCall_64:
29767 return EmitLoweredTLSCall(MI, BB);
29768 case X86::CMOV_FR32:
29769 case X86::CMOV_FR64:
29770 case X86::CMOV_GR8:
29771 case X86::CMOV_GR16:
29772 case X86::CMOV_GR32:
29773 case X86::CMOV_RFP32:
29774 case X86::CMOV_RFP64:
29775 case X86::CMOV_RFP80:
29776 case X86::CMOV_VR128:
29777 case X86::CMOV_VR128X:
29778 case X86::CMOV_VR256:
29779 case X86::CMOV_VR256X:
29780 case X86::CMOV_VR512:
29781 case X86::CMOV_VK2:
29782 case X86::CMOV_VK4:
29783 case X86::CMOV_VK8:
29784 case X86::CMOV_VK16:
29785 case X86::CMOV_VK32:
29786 case X86::CMOV_VK64:
29787 return EmitLoweredSelect(MI, BB);
29789 case X86::RDFLAGS32:
29790 case X86::RDFLAGS64: {
29792 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
29793 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
29794 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
29795 // Permit reads of the EFLAGS and DF registers without them being defined.
29796 // This intrinsic exists to read external processor state in flags, such as
29797 // the trap flag, interrupt flag, and direction flag, none of which are
29798 // modeled by the backend.
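//
// The resulting expansion is simply (64-bit form shown):
//   pushfq
//   popq <dst>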
29799 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
29800 "Unexpected register in operand!");
29801 Push->getOperand(2).setIsUndef();
29802 assert(Push->getOperand(3).getReg() == X86::DF &&
29803 "Unexpected register in operand!");
29804 Push->getOperand(3).setIsUndef();
29805 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
29807 MI.eraseFromParent(); // The pseudo is gone now.
29811 case X86::WRFLAGS32:
29812 case X86::WRFLAGS64: {
29814 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
29816 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
29817 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
29818 BuildMI(*BB, MI, DL, TII->get(PopF));
29820 MI.eraseFromParent(); // The pseudo is gone now.
29824 case X86::RELEASE_FADD32mr:
29825 case X86::RELEASE_FADD64mr:
29826 return EmitLoweredAtomicFP(MI, BB);
29828 case X86::FP32_TO_INT16_IN_MEM:
29829 case X86::FP32_TO_INT32_IN_MEM:
29830 case X86::FP32_TO_INT64_IN_MEM:
29831 case X86::FP64_TO_INT16_IN_MEM:
29832 case X86::FP64_TO_INT32_IN_MEM:
29833 case X86::FP64_TO_INT64_IN_MEM:
29834 case X86::FP80_TO_INT16_IN_MEM:
29835 case X86::FP80_TO_INT32_IN_MEM:
29836 case X86::FP80_TO_INT64_IN_MEM: {
29837 // Change the floating point control register to use "round towards zero"
29838 // mode when truncating to an integer value.
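//
// This is the classic x87 idiom: FNSTCW saves the current control word to a
// stack slot, the rounding-control bits are rewritten to select truncation,
// FLDCW loads the modified word, the IST_Fp* store does the conversion, and a
// final FLDCW restores the original control word.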
29839 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
29840 addFrameReference(BuildMI(*BB, MI, DL,
29841 TII->get(X86::FNSTCW16m)), CWFrameIdx);
29843 // Load the old value of the control word...
29845 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
29846 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
29849 // Set the high part to be round to zero...
29850 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
29853 // Reload the modified control word now...
29854 addFrameReference(BuildMI(*BB, MI, DL,
29855 TII->get(X86::FLDCW16m)), CWFrameIdx);
29857 // Restore the memory image of control word to original value
29858 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
29861 // Get the X86 opcode to use.
29863 switch (MI.getOpcode()) {
29864 default: llvm_unreachable("illegal opcode!");
29865 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
29866 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
29867 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
29868 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
29869 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
29870 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
29871 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
29872 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
29873 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
29876 X86AddressMode AM = getAddressFromInstr(&MI, 0);
29877 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
29878 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
29880 // Reload the original control word now.
29881 addFrameReference(BuildMI(*BB, MI, DL,
29882 TII->get(X86::FLDCW16m)), CWFrameIdx);
29884 MI.eraseFromParent(); // The pseudo instruction is gone now.
29887 // Thread synchronization.
29889 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
29890 case X86::MONITORX:
29891 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
29895 return emitClzero(&MI, BB, Subtarget);
29899 return emitWRPKRU(MI, BB, Subtarget);
29901 return emitRDPKRU(MI, BB, Subtarget);
29904 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
29906 case X86::VASTART_SAVE_XMM_REGS:
29907 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
29909 case X86::VAARG_64:
29910 return EmitVAARG64WithCustomInserter(MI, BB);
29912 case X86::EH_SjLj_SetJmp32:
29913 case X86::EH_SjLj_SetJmp64:
29914 return emitEHSjLjSetJmp(MI, BB);
29916 case X86::EH_SjLj_LongJmp32:
29917 case X86::EH_SjLj_LongJmp64:
29918 return emitEHSjLjLongJmp(MI, BB);
29920 case X86::Int_eh_sjlj_setup_dispatch:
29921 return EmitSjLjDispatchBlock(MI, BB);
29923 case TargetOpcode::STATEPOINT:
29924 // As an implementation detail, STATEPOINT shares the STACKMAP format at
29925 // this point in the process. We diverge later.
29926 return emitPatchPoint(MI, BB);
29928 case TargetOpcode::STACKMAP:
29929 case TargetOpcode::PATCHPOINT:
29930 return emitPatchPoint(MI, BB);
29932 case TargetOpcode::PATCHABLE_EVENT_CALL:
29933 return emitXRayCustomEvent(MI, BB);
29935 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
29936 return emitXRayTypedEvent(MI, BB);
29938 case X86::LCMPXCHG8B: {
29939 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
29940 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
29941 // requires a memory operand. If the current architecture is i686 and the
29942 // current function needs a base pointer (which is ESI on i686), the
29943 // register allocator would not be able to allocate registers for an address
29944 // of the form X(%reg, %reg, Y): there would never be enough unreserved
29945 // registers during regalloc (without the base pointer the only remaining
29946 // option would be X(%edi, %esi, Y)).
29947 // We give the register allocator a hand by precomputing the address in a
29948 // new vreg using LEA.
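//
// In other words, instead of "lock cmpxchg8b X(%reg1,%reg2,Y)" we emit,
// roughly:
//   leal X(%reg1,%reg2,Y), %vreg
//   lock cmpxchg8b (%vreg)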
29950 // If it is not i686 or there is no base pointer - nothing to do here.
29951 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
29954 // Even though this code does not necessarily need the base pointer to
29955 // be ESI, we check for that. The reason: if this assert fails, something
29956 // has changed in the compiler's base pointer handling, and that change
29957 // most probably has to be addressed here as well.
29958 assert(TRI->getBaseRegister() == X86::ESI &&
29959 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
29960 "base pointer in mind");
29962 MachineRegisterInfo &MRI = MF->getRegInfo();
29963 MVT SPTy = getPointerTy(MF->getDataLayout());
29964 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
29965 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
29967 X86AddressMode AM = getAddressFromInstr(&MI, 0);
29968 // Regalloc does not need any help when the memory operand of CMPXCHG8B
29969 // does not use an index register.
29970 if (AM.IndexReg == X86::NoRegister)
29973 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
29974 // four operand definitions that are E[ABCD] registers. We skip them and
29975 // then insert the LEA.
29976 MachineBasicBlock::iterator MBBI(MI);
29977 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
29978 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
29981 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
29983 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
29987 case X86::LCMPXCHG16B:
29989 case X86::LCMPXCHG8B_SAVE_EBX:
29990 case X86::LCMPXCHG16B_SAVE_RBX: {
29992 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
29993 if (!BB->isLiveIn(BasePtr))
29994 BB->addLiveIn(BasePtr);
30000 //===----------------------------------------------------------------------===//
30001 // X86 Optimization Hooks
30002 //===----------------------------------------------------------------------===//
30005 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
30006 const APInt &Demanded,
30007 TargetLoweringOpt &TLO) const {
30008 // Only optimize Ands to prevent shrinking a constant that could be
30009 // matched by movzx.
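//
// Worked example (illustrative): for (and X, 0xF0) where only bits 4..7 are
// demanded, ShrunkMask = 0xF0 and Width rounds up to 8, so the constant is
// widened to 0xFF; that form can later be matched as a byte zero-extend
// (movzx) instead of materializing 0xF0.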
30010 if (Op.getOpcode() != ISD::AND)
30013 EVT VT = Op.getValueType();
30019 unsigned Size = VT.getSizeInBits();
30021 // Make sure the RHS really is a constant.
30022 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
30026 const APInt &Mask = C->getAPIntValue();
30028 // Clear all non-demanded bits initially.
30029 APInt ShrunkMask = Mask & Demanded;
30031 // Find the width of the shrunk mask.
30032 unsigned Width = ShrunkMask.getActiveBits();
30034 // If the mask is all 0s there's nothing to do here.
30038 // Find the next power of 2 width, rounding up to a byte.
30039 Width = PowerOf2Ceil(std::max(Width, 8U));
30040 // Truncate the width to size to handle illegal types.
30041 Width = std::min(Width, Size);
30043 // Calculate a possible zero extend mask for this constant.
30044 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
30046 // If we aren't changing the mask, just return true to keep it and prevent
30047 // the caller from optimizing.
30048 if (ZeroExtendMask == Mask)
30051 // Make sure the new mask can be represented by a combination of mask bits
30052 // and non-demanded bits.
30053 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
30056 // Replace the constant with the zero extend mask.
30058 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
30059 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
30060 return TLO.CombineTo(Op, NewOp);
30063 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
30065 const APInt &DemandedElts,
30066 const SelectionDAG &DAG,
30067 unsigned Depth) const {
30068 unsigned BitWidth = Known.getBitWidth();
30069 unsigned Opc = Op.getOpcode();
30070 EVT VT = Op.getValueType();
30071 assert((Opc >= ISD::BUILTIN_OP_END ||
30072 Opc == ISD::INTRINSIC_WO_CHAIN ||
30073 Opc == ISD::INTRINSIC_W_CHAIN ||
30074 Opc == ISD::INTRINSIC_VOID) &&
30075 "Should use MaskedValueIsZero if you don't know whether Op"
30076 " is a target node!");
30081 case X86ISD::SETCC:
30082 Known.Zero.setBitsFrom(1);
30084 case X86ISD::MOVMSK: {
30085 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
30086 Known.Zero.setBitsFrom(NumLoBits);
30089 case X86ISD::PEXTRB:
30090 case X86ISD::PEXTRW: {
30091 SDValue Src = Op.getOperand(0);
30092 EVT SrcVT = Src.getValueType();
30093 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
30094 Op.getConstantOperandVal(1));
30095 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
30096 Known = Known.zextOrTrunc(BitWidth);
30097 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
30100 case X86ISD::VSRAI:
30101 case X86ISD::VSHLI:
30102 case X86ISD::VSRLI: {
30103 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
30104 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
30105 Known.setAllZero();
30109 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
30110 unsigned ShAmt = ShiftImm->getZExtValue();
30111 if (Opc == X86ISD::VSHLI) {
30112 Known.Zero <<= ShAmt;
30113 Known.One <<= ShAmt;
30114 // Low bits are known zero.
30115 Known.Zero.setLowBits(ShAmt);
30116 } else if (Opc == X86ISD::VSRLI) {
30117 Known.Zero.lshrInPlace(ShAmt);
30118 Known.One.lshrInPlace(ShAmt);
30119 // High bits are known zero.
30120 Known.Zero.setHighBits(ShAmt);
30122 Known.Zero.ashrInPlace(ShAmt);
30123 Known.One.ashrInPlace(ShAmt);
30128 case X86ISD::PACKUS: {
30129 // PACKUS is just a truncation if the upper half is zero.
30130 APInt DemandedLHS, DemandedRHS;
30131 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
30133 Known.One = APInt::getAllOnesValue(BitWidth * 2);
30134 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
30137 if (!!DemandedLHS) {
30138 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
30139 Known.One &= Known2.One;
30140 Known.Zero &= Known2.Zero;
30142 if (!!DemandedRHS) {
30143 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
30144 Known.One &= Known2.One;
30145 Known.Zero &= Known2.Zero;
30148 if (Known.countMinLeadingZeros() < BitWidth)
30150 Known = Known.trunc(BitWidth);
30153 case X86ISD::CMOV: {
30154 Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
30155 // If we don't know any bits, early out.
30156 if (Known.isUnknown())
30158 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
30160 // Only known if known in both the LHS and RHS.
30161 Known.One &= Known2.One;
30162 Known.Zero &= Known2.Zero;
30167 // Handle target shuffles.
30168 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
30169 if (isTargetShuffle(Opc)) {
30171 SmallVector<int, 64> Mask;
30172 SmallVector<SDValue, 2> Ops;
30173 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
30175 unsigned NumOps = Ops.size();
30176 unsigned NumElts = VT.getVectorNumElements();
30177 if (Mask.size() == NumElts) {
30178 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
30179 Known.Zero.setAllBits(); Known.One.setAllBits();
30180 for (unsigned i = 0; i != NumElts; ++i) {
30181 if (!DemandedElts[i])
30184 if (M == SM_SentinelUndef) {
30185 // For UNDEF elements, we don't know anything about the common state
30186 // of the shuffle result.
30189 } else if (M == SM_SentinelZero) {
30190 Known.One.clearAllBits();
30193 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
30194 "Shuffle index out of range");
30196 unsigned OpIdx = (unsigned)M / NumElts;
30197 unsigned EltIdx = (unsigned)M % NumElts;
30198 if (Ops[OpIdx].getValueType() != VT) {
30199 // TODO - handle target shuffle ops with different value types.
30203 DemandedOps[OpIdx].setBit(EltIdx);
30205 // Known bits are the values that are shared by every demanded element.
30206 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
30207 if (!DemandedOps[i])
30210 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
30211 Known.One &= Known2.One;
30212 Known.Zero &= Known2.Zero;
30219 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
30220 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
30221 unsigned Depth) const {
30222 unsigned VTBits = Op.getScalarValueSizeInBits();
30223 unsigned Opcode = Op.getOpcode();
30225 case X86ISD::SETCC_CARRY:
30226 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
30229 case X86ISD::VTRUNC: {
30230 // TODO: Add DemandedElts support.
30231 SDValue Src = Op.getOperand(0);
30232 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
30233 assert(VTBits < NumSrcBits && "Illegal truncation input type");
30234 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
30235 if (Tmp > (NumSrcBits - VTBits))
30236 return Tmp - (NumSrcBits - VTBits);
30240 case X86ISD::PACKSS: {
30241 // PACKSS is just a truncation if the sign bits extend to the packed size.
30242 APInt DemandedLHS, DemandedRHS;
30243 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
30246 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
30247 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
30249 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
30251 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
30252 unsigned Tmp = std::min(Tmp0, Tmp1);
30253 if (Tmp > (SrcBits - VTBits))
30254 return Tmp - (SrcBits - VTBits);
30258 case X86ISD::VSHLI: {
30259 SDValue Src = Op.getOperand(0);
30260 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
30261 if (ShiftVal.uge(VTBits))
30262 return VTBits; // Shifted all bits out --> zero.
30263 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
30264 if (ShiftVal.uge(Tmp))
30265 return 1; // Shifted all sign bits out --> unknown.
30266 return Tmp - ShiftVal.getZExtValue();
30269 case X86ISD::VSRAI: {
30270 SDValue Src = Op.getOperand(0);
30271 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
30272 if (ShiftVal.uge(VTBits - 1))
30273 return VTBits; // Sign splat.
30274 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
30276 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
30279 case X86ISD::PCMPGT:
30280 case X86ISD::PCMPEQ:
30282 case X86ISD::VPCOM:
30283 case X86ISD::VPCOMU:
30284 // Vector compares return zero/all-bits result values.
30287 case X86ISD::CMOV: {
30288 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
30289 if (Tmp0 == 1) return 1; // Early out.
30290 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
30291 return std::min(Tmp0, Tmp1);
30299 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
30300 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
30301 return N->getOperand(0);
30305 // Attempt to match a combined shuffle mask against supported unary shuffle instructions.
30307 // TODO: Investigate sharing more of this with shuffle lowering.
30308 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
30309 bool AllowFloatDomain, bool AllowIntDomain,
30310 SDValue &V1, const SDLoc &DL,
30312 const X86Subtarget &Subtarget,
30313 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
30314 unsigned NumMaskElts = Mask.size();
30315 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
30317 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
30318 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
30319 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
30320 Shuffle = X86ISD::VZEXT_MOVL;
30321 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
30325 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
30326 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
30327 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
30328 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
30329 unsigned MaxScale = 64 / MaskEltSize;
30330 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
30332 unsigned NumDstElts = NumMaskElts / Scale;
30333 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
30334 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
30335 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
30338 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
30339 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
30340 MVT::getIntegerVT(MaskEltSize);
30341 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
30343 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
30344 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
30346 if (SrcVT.getVectorNumElements() == NumDstElts)
30347 Shuffle = unsigned(ISD::ZERO_EXTEND);
30349 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
30351 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
30352 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
30358 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
30359 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
30360 isUndefOrEqual(Mask[0], 0) &&
30361 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
30362 Shuffle = X86ISD::VZEXT_MOVL;
30363 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
30367 // Check if we have SSE3 which will let us use MOVDDUP etc. The
30368 // instructions are no slower than UNPCKLPD but have the option to
30369 // fold the input operand into even an unaligned memory load.
30370 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
30371 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
30372 Shuffle = X86ISD::MOVDDUP;
30373 SrcVT = DstVT = MVT::v2f64;
30376 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
30377 Shuffle = X86ISD::MOVSLDUP;
30378 SrcVT = DstVT = MVT::v4f32;
30381 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
30382 Shuffle = X86ISD::MOVSHDUP;
30383 SrcVT = DstVT = MVT::v4f32;
30388 if (MaskVT.is256BitVector() && AllowFloatDomain) {
30389 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
30390 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
30391 Shuffle = X86ISD::MOVDDUP;
30392 SrcVT = DstVT = MVT::v4f64;
30395 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
30396 Shuffle = X86ISD::MOVSLDUP;
30397 SrcVT = DstVT = MVT::v8f32;
30400 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
30401 Shuffle = X86ISD::MOVSHDUP;
30402 SrcVT = DstVT = MVT::v8f32;
30407 if (MaskVT.is512BitVector() && AllowFloatDomain) {
30408 assert(Subtarget.hasAVX512() &&
30409 "AVX512 required for 512-bit vector shuffles");
30410 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
30411 Shuffle = X86ISD::MOVDDUP;
30412 SrcVT = DstVT = MVT::v8f64;
30415 if (isTargetShuffleEquivalent(
30416 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
30417 Shuffle = X86ISD::MOVSLDUP;
30418 SrcVT = DstVT = MVT::v16f32;
30421 if (isTargetShuffleEquivalent(
30422 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
30423 Shuffle = X86ISD::MOVSHDUP;
30424 SrcVT = DstVT = MVT::v16f32;
30429 // Attempt to match against broadcast-from-vector.
30430 if (Subtarget.hasAVX2()) {
30431 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
30432 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
30433 SrcVT = DstVT = MaskVT;
30434 Shuffle = X86ISD::VBROADCAST;
30442 // Attempt to match a combined shuffle mask against supported unary immediate
30443 // permute instructions.
30444 // TODO: Investigate sharing more of this with shuffle lowering.
30445 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
30446 const APInt &Zeroable,
30447 bool AllowFloatDomain,
30448 bool AllowIntDomain,
30449 const X86Subtarget &Subtarget,
30450 unsigned &Shuffle, MVT &ShuffleVT,
30451 unsigned &PermuteImm) {
30452 unsigned NumMaskElts = Mask.size();
30453 unsigned InputSizeInBits = MaskVT.getSizeInBits();
30454 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
30455 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
30457 bool ContainsZeros =
30458 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
30460 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
30461 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
30462 // Check for lane crossing permutes.
30463 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
30464 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
30465 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
30466 Shuffle = X86ISD::VPERMI;
30467 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
30468 PermuteImm = getV4X86ShuffleImm(Mask);
30471 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
30472 SmallVector<int, 4> RepeatedMask;
30473 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
30474 Shuffle = X86ISD::VPERMI;
30475 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
30476 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
30480 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
30481 // VPERMILPD can permute with a non-repeating shuffle.
30482 Shuffle = X86ISD::VPERMILPI;
30483 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
30485 for (int i = 0, e = Mask.size(); i != e; ++i) {
30487 if (M == SM_SentinelUndef)
30489 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
30490 PermuteImm |= (M & 1) << i;
30496 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
30497 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
30498 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
30499 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
30500 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
30501 SmallVector<int, 4> RepeatedMask;
30502 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
30503 // Narrow the repeated mask to create 32-bit element permutes.
30504 SmallVector<int, 4> WordMask = RepeatedMask;
30505 if (MaskScalarSizeInBits == 64)
30506 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
30508 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
30509 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
30510 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
30511 PermuteImm = getV4X86ShuffleImm(WordMask);
30516 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
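// Illustrative example: the v8i16 mask {1, 0, 3, 2, 4, 5, 6, 7} matches
// PSHUFLW - the high words are the identity, and the low-word mask
// {1, 0, 3, 2} packs into the 2-bits-per-element immediate
// 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xB1.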
30517 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
30518 SmallVector<int, 4> RepeatedMask;
30519 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
30520 ArrayRef<int> LoMask(Mask.data() + 0, 4);
30521 ArrayRef<int> HiMask(Mask.data() + 4, 4);
30523 // PSHUFLW: permute lower 4 elements only.
30524 if (isUndefOrInRange(LoMask, 0, 4) &&
30525 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
30526 Shuffle = X86ISD::PSHUFLW;
30527 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
30528 PermuteImm = getV4X86ShuffleImm(LoMask);
30532 // PSHUFHW: permute upper 4 elements only.
30533 if (isUndefOrInRange(HiMask, 4, 8) &&
30534 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
30535 // Offset the HiMask so that we can create the shuffle immediate.
30536 int OffsetHiMask[4];
30537 for (int i = 0; i != 4; ++i)
30538 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
30540 Shuffle = X86ISD::PSHUFHW;
30541 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
30542 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
30548 // Attempt to match against byte/bit shifts.
30549 // FIXME: Add 512-bit support.
30550 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
30551 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
30552 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
30553 MaskScalarSizeInBits, Mask,
30554 0, Zeroable, Subtarget);
30555 if (0 < ShiftAmt) {
30556 PermuteImm = (unsigned)ShiftAmt;
30564 // Attempt to match a combined unary shuffle mask against supported binary
30565 // shuffle instructions.
30566 // TODO: Investigate sharing more of this with shuffle lowering.
30567 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
30568 bool AllowFloatDomain, bool AllowIntDomain,
30569 SDValue &V1, SDValue &V2, const SDLoc &DL,
30571 const X86Subtarget &Subtarget,
30572 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
30574 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
30576 if (MaskVT.is128BitVector()) {
30577 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
30579 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
30580 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
30581 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
30584 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
30586 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
30587 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
30590 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
30591 (AllowFloatDomain || !Subtarget.hasSSE41())) {
30593 Shuffle = X86ISD::MOVSD;
30594 SrcVT = DstVT = MVT::v2f64;
30597 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
30598 (AllowFloatDomain || !Subtarget.hasSSE41())) {
30599 Shuffle = X86ISD::MOVSS;
30600 SrcVT = DstVT = MVT::v4f32;
30605 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
30606 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
30607 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
30608 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
30609 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
30616 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
30617 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
30618 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
30619 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
30620 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
30621 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
30622 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
30624 SrcVT = DstVT = MaskVT;
30625 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
30626 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
30634 static bool matchBinaryPermuteVectorShuffle(
30635 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
30636 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
30637 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
30638 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
30639 unsigned NumMaskElts = Mask.size();
30640 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
30642 // Attempt to match against PALIGNR byte rotate.
30643 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
30644 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
30645 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
30646 if (0 < ByteRotation) {
30647 Shuffle = X86ISD::PALIGNR;
30648 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
30649 PermuteImm = ByteRotation;
30654 // Attempt to combine to X86ISD::BLENDI.
30655 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
30656 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
30657 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
30658 uint64_t BlendMask = 0;
30659 bool ForceV1Zero = false, ForceV2Zero = false;
30660 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
30661 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
30663 if (MaskVT == MVT::v16i16) {
30664 // We can only use v16i16 PBLENDW if the lanes are repeated.
30665 SmallVector<int, 8> RepeatedMask;
30666 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
30668 assert(RepeatedMask.size() == 8 &&
30669 "Repeated mask size doesn't match!");
30671 for (int i = 0; i < 8; ++i)
30672 if (RepeatedMask[i] >= 8)
30673 PermuteImm |= 1 << i;
30674 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
30675 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
30676 Shuffle = X86ISD::BLENDI;
30677 ShuffleVT = MaskVT;
30681 // Determine a type compatible with X86ISD::BLENDI.
30682 ShuffleVT = MaskVT;
30683 if (Subtarget.hasAVX2()) {
30684 if (ShuffleVT == MVT::v4i64)
30685 ShuffleVT = MVT::v8i32;
30686 else if (ShuffleVT == MVT::v2i64)
30687 ShuffleVT = MVT::v4i32;
30689 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
30690 ShuffleVT = MVT::v8i16;
30691 else if (ShuffleVT == MVT::v4i64)
30692 ShuffleVT = MVT::v4f64;
30693 else if (ShuffleVT == MVT::v8i32)
30694 ShuffleVT = MVT::v8f32;
30697 if (!ShuffleVT.isFloatingPoint()) {
30698 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
30700 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
30701 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
30702 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
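// Illustrative sketch of the scaling above: a v2i64 blend mask 0b01 widened
// to v4i32 (Scale = 2) becomes 0b0011 - each original mask bit is replicated
// Scale times so the blend still selects the same 64 bits.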
30705 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
30706 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
30707 PermuteImm = (unsigned)BlendMask;
30708 Shuffle = X86ISD::BLENDI;
30714 // Attempt to combine to INSERTPS.
30715 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
30716 MaskVT.is128BitVector()) {
30717 if (Zeroable.getBoolValue() &&
30718 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
30719 Shuffle = X86ISD::INSERTPS;
30720 ShuffleVT = MVT::v4f32;
30725 // Attempt to combine to SHUFPD.
30726 if (AllowFloatDomain && EltSizeInBits == 64 &&
30727 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
30728 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
30729 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
30730 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
30731 Shuffle = X86ISD::SHUFP;
30732 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
30737 // Attempt to combine to SHUFPS.
30738 if (AllowFloatDomain && EltSizeInBits == 32 &&
30739 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
30740 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
30741 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
30742 SmallVector<int, 4> RepeatedMask;
30743 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
30744 // Match each half of the repeated mask, to determine whether it's just
30745 // referencing one of the vectors, is zeroable, or is entirely undef.
30746 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
30747 int M0 = RepeatedMask[Offset];
30748 int M1 = RepeatedMask[Offset + 1];
30750 if (isUndefInRange(RepeatedMask, Offset, 2)) {
30751 return DAG.getUNDEF(MaskVT);
30752 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
30753 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
30754 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
30755 return getZeroVector(MaskVT, Subtarget, DAG, DL);
30756 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
30757 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
30758 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
30760 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
30761 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
30762 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
30769 int ShufMask[4] = {-1, -1, -1, -1};
30770 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
30771 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
30776 Shuffle = X86ISD::SHUFP;
30777 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
30778 PermuteImm = getV4X86ShuffleImm(ShufMask);
30787 /// Combine an arbitrary chain of shuffles into a single instruction if
30788 /// possible.
30790 /// This is the leaf of the recursive combine below. When we have found some
30791 /// chain of single-use x86 shuffle instructions and accumulated the combined
30792 /// shuffle mask represented by them, this will try to pattern match that mask
30793 /// into either a single instruction if there is a special purpose instruction
30794 /// for this operation, or into a PSHUFB instruction which is a fully general
30795 /// instruction but should only be used to replace chains over a certain depth.
30796 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
30797 ArrayRef<int> BaseMask, int Depth,
30798 bool HasVariableMask,
30799 bool AllowVariableMask, SelectionDAG &DAG,
30800 const X86Subtarget &Subtarget) {
30801 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
30802 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
30803 "Unexpected number of shuffle inputs!");
30805 // Find the inputs that enter the chain. Note that multiple uses are OK
30806 // here; we're not going to remove the operands we find.
30807 bool UnaryShuffle = (Inputs.size() == 1);
30808 SDValue V1 = peekThroughBitcasts(Inputs[0]);
30809 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
30810 : peekThroughBitcasts(Inputs[1]));
30812 MVT VT1 = V1.getSimpleValueType();
30813 MVT VT2 = V2.getSimpleValueType();
30814 MVT RootVT = Root.getSimpleValueType();
30815 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
30816 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
30817 "Vector size mismatch");
30822 unsigned NumBaseMaskElts = BaseMask.size();
30823 if (NumBaseMaskElts == 1) {
30824 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
30825 return DAG.getBitcast(RootVT, V1);
30828 unsigned RootSizeInBits = RootVT.getSizeInBits();
30829 unsigned NumRootElts = RootVT.getVectorNumElements();
30830 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
30831 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
30832 (RootVT.isFloatingPoint() && Depth >= 2) ||
30833 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
30835 // Don't combine if we are an AVX512/EVEX target and the mask element size
30836 // is different from the root element size - this would prevent writemasks
30837 // from being reused.
30838 // TODO - this currently prevents all lane shuffles from occurring.
30839 // TODO - check for writemasks usage instead of always preventing combining.
30840 // TODO - attempt to narrow Mask back to writemask size.
30841 bool IsEVEXShuffle =
30842 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
30844 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
30846 // Handle 128-bit lane shuffles of 256-bit vectors.
30847 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
30848 // we need to use the zeroing feature.
30849 // TODO - this should support binary shuffles.
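// Illustrative example of the encoding built below: a widened 2-element base
// mask {1, SM_SentinelZero} gives PermMask = 1 | (0x8 << 4) = 0x81, i.e. take
// the upper 128-bit lane of the input for the low half and zero the high half.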
30850 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
30851 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
30852 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
30853 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
30854 return SDValue(); // Nothing to do!
30855 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
30856 unsigned PermMask = 0;
30857 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
30858 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
30860 Res = DAG.getBitcast(ShuffleVT, V1);
30861 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
30862 DAG.getUNDEF(ShuffleVT),
30863 DAG.getConstant(PermMask, DL, MVT::i8));
30864 return DAG.getBitcast(RootVT, Res);
30867 // For masks that have been widened to 128-bit elements or more,
30868 // narrow back down to 64-bit elements.
30869 SmallVector<int, 64> Mask;
30870 if (BaseMaskEltSizeInBits > 64) {
30871 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
30872 int MaskScale = BaseMaskEltSizeInBits / 64;
30873 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
30875 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
30878 unsigned NumMaskElts = Mask.size();
30879 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
30881 // Determine the effective mask value type.
30882 FloatDomain &= (32 <= MaskEltSizeInBits);
30883 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
30884 : MVT::getIntegerVT(MaskEltSizeInBits);
30885 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
30887 // Only allow legal mask types.
30888 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
30891 // Attempt to match the mask against known shuffle patterns.
30892 MVT ShuffleSrcVT, ShuffleVT;
30893 unsigned Shuffle, PermuteImm;
30895 // Which shuffle domains are permitted?
30896 // Permit domain crossing at higher combine depths.
30897 bool AllowFloatDomain = FloatDomain || (Depth > 3);
30898 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
30899 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
30901 // Determine zeroable mask elements.
30902 APInt Zeroable(NumMaskElts, 0);
30903 for (unsigned i = 0; i != NumMaskElts; ++i)
30904 if (isUndefOrZero(Mask[i]))
30905 Zeroable.setBit(i);
30907 if (UnaryShuffle) {
30908 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
30909 // directly if we don't shuffle the lower element and we shuffle the upper
30910 // (zero) elements within themselves.
30911 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
30912 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
30913 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
30914 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
30915 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
30916 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
30917 return DAG.getBitcast(RootVT, V1);
30921 SDValue NewV1 = V1; // Save operand in case early exit happens.
30922 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
30923 NewV1, DL, DAG, Subtarget, Shuffle,
30924 ShuffleSrcVT, ShuffleVT) &&
30925 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
30926 if (Depth == 1 && Root.getOpcode() == Shuffle)
30927 return SDValue(); // Nothing to do!
30928 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
30929 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
30930 return DAG.getBitcast(RootVT, Res);
30933 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
30934 AllowIntDomain, Subtarget, Shuffle,
30935 ShuffleVT, PermuteImm) &&
30936 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
30937 if (Depth == 1 && Root.getOpcode() == Shuffle)
30938 return SDValue(); // Nothing to do!
30939 Res = DAG.getBitcast(ShuffleVT, V1);
30940 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
30941 DAG.getConstant(PermuteImm, DL, MVT::i8));
30942 return DAG.getBitcast(RootVT, Res);
30946 SDValue NewV1 = V1; // Save operands in case early exit happens.
30947 SDValue NewV2 = V2;
30948 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
30949 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
30950 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
30951 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
30952 if (Depth == 1 && Root.getOpcode() == Shuffle)
30953 return SDValue(); // Nothing to do!
30954 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
30955 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
30956 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
30957 return DAG.getBitcast(RootVT, Res);
30960 NewV1 = V1; // Save operands in case early exit happens.
30962 if (matchBinaryPermuteVectorShuffle(
30963 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
30964 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
30965 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
30966 if (Depth == 1 && Root.getOpcode() == Shuffle)
30967 return SDValue(); // Nothing to do!
30968 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
30969 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
30970 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
30971 DAG.getConstant(PermuteImm, DL, MVT::i8));
30972 return DAG.getBitcast(RootVT, Res);
30975 // Typically from here on, we need an integer version of MaskVT.
30976 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
30977 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
30979 // Annoyingly, SSE4A instructions don't map into the above match helpers.
30980 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
30981 uint64_t BitLen, BitIdx;
30982 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
30984 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
30985 return SDValue(); // Nothing to do!
30986 V1 = DAG.getBitcast(IntMaskVT, V1);
30987 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
30988 DAG.getConstant(BitLen, DL, MVT::i8),
30989 DAG.getConstant(BitIdx, DL, MVT::i8));
30990 return DAG.getBitcast(RootVT, Res);
30993 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
30994 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
30995 return SDValue(); // Nothing to do!
30996 V1 = DAG.getBitcast(IntMaskVT, V1);
30997 V2 = DAG.getBitcast(IntMaskVT, V2);
30998 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
30999 DAG.getConstant(BitLen, DL, MVT::i8),
31000 DAG.getConstant(BitIdx, DL, MVT::i8));
31001 return DAG.getBitcast(RootVT, Res);
31005 // Don't try to re-form single instruction chains under any circumstances now
31006 // that we've done encoding canonicalization for them.
31010 // Depth threshold above which we can efficiently use variable mask shuffles.
31011 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
31012 AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
31014 bool MaskContainsZeros =
31015 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
31017 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
31018 // If we have a single input lane-crossing shuffle then lower to VPERMV.
31019 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
31020 ((Subtarget.hasAVX2() &&
31021 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
31022 (Subtarget.hasAVX512() &&
31023 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
31024 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
31025 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
31026 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
31027 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
31028 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
31029 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
31030 Res = DAG.getBitcast(MaskVT, V1);
31031 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
31032 return DAG.getBitcast(RootVT, Res);
31035 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
31036 // vector as the second source.
31037 if (UnaryShuffle && AllowVariableMask &&
31038 ((Subtarget.hasAVX512() &&
31039 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
31040 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
31041 (Subtarget.hasVLX() &&
31042 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
31043 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
31044 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
31045 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
31046 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
31047 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
31048 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
31049 for (unsigned i = 0; i != NumMaskElts; ++i)
31050 if (Mask[i] == SM_SentinelZero)
31051 Mask[i] = NumMaskElts + i;
31053 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
31054 Res = DAG.getBitcast(MaskVT, V1);
31055 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
31056 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
31057 return DAG.getBitcast(RootVT, Res);
31060 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
31061 if (AllowVariableMask && !MaskContainsZeros &&
31062 ((Subtarget.hasAVX512() &&
31063 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
31064 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
31065 (Subtarget.hasVLX() &&
31066 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
31067 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
31068 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
31069 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
31070 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
31071 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
31072 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
31073 V1 = DAG.getBitcast(MaskVT, V1);
31074 V2 = DAG.getBitcast(MaskVT, V2);
31075 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
31076 return DAG.getBitcast(RootVT, Res);
31081 // See if we can combine a single input shuffle with zeros to a bit-mask,
31082 // which is much simpler than any shuffle.
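// Illustrative example: a v4i32 mask {0, SM_SentinelZero, 2, SM_SentinelZero}
// keeps elements in place and only zeroes lanes 1 and 3, so it can be lowered
// to an AND with the constant vector {-1, 0, -1, 0} instead of a shuffle.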
31083 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
31084 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
31085 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
31086 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
31087 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
31088 APInt UndefElts(NumMaskElts, 0);
31089 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
31090 for (unsigned i = 0; i != NumMaskElts; ++i) {
31092 if (M == SM_SentinelUndef) {
31093 UndefElts.setBit(i);
31096 if (M == SM_SentinelZero)
31098 EltBits[i] = AllOnes;
31100 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
31101 Res = DAG.getBitcast(MaskVT, V1);
31102 unsigned AndOpcode =
31103 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
31104 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
31105 return DAG.getBitcast(RootVT, Res);
31108 // If we have a single input shuffle with different shuffle patterns in the
31109 // 128-bit lanes, use a variable-mask VPERMILPS.
31110 // TODO: Combine other mask types at higher depths.
31111 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
31112 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
31113 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
31114 SmallVector<SDValue, 16> VPermIdx;
31115 for (int M : Mask) {
31117 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
31118 VPermIdx.push_back(Idx);
31120 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
31121 Res = DAG.getBitcast(MaskVT, V1);
31122 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
31123 return DAG.getBitcast(RootVT, Res);
31126 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
31127 // to VPERMIL2PD/VPERMIL2PS.
31128 if (AllowVariableMask && Subtarget.hasXOP() &&
31129 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
31130 MaskVT == MVT::v8f32)) {
31131 // VPERMIL2 Operation.
31132 // Bits[3] - Match Bit.
31133 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
31134 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
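// Illustrative example: for a v4f32 mask, the index computed below for M = 6
// (element 2 of the second input) is (6 % 4) + ((6 / 4) * 4) = 6, i.e.
// selector bits[1:0] = 2 pick the element and bit[2] picks the second source;
// zeroed lanes push the value 8 (match bit set) instead.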
31135 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
31136 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
31137 SmallVector<int, 8> VPerm2Idx;
31138 unsigned M2ZImm = 0;
31139 for (int M : Mask) {
31140 if (M == SM_SentinelUndef) {
31141 VPerm2Idx.push_back(-1);
31144 if (M == SM_SentinelZero) {
31146 VPerm2Idx.push_back(8);
31149 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
31150 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
31151 VPerm2Idx.push_back(Index);
31153 V1 = DAG.getBitcast(MaskVT, V1);
31154 V2 = DAG.getBitcast(MaskVT, V2);
31155 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
31156 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
31157 DAG.getConstant(M2ZImm, DL, MVT::i8));
31158 return DAG.getBitcast(RootVT, Res);
31161 // If we have 3 or more shuffle instructions or a chain involving a variable
31162 // mask, we can replace them with a single PSHUFB instruction profitably.
31163 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
31164 // instructions, but in practice PSHUFB tends to be *very* fast so we're
31165 // more aggressive.
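// Illustrative sketch of the byte mask built below: a v4i32 mask
// {2, SM_SentinelZero, 0, 1} expands (with Ratio = 4) to the byte selectors
// {8,9,10,11, 255,255,255,255, 0,1,2,3, 4,5,6,7}; any selector byte with its
// top bit set (255) is zeroed by PSHUFB.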
31166 if (UnaryShuffle && AllowVariableMask &&
31167 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
31168 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
31169 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
31170 SmallVector<SDValue, 16> PSHUFBMask;
31171 int NumBytes = RootVT.getSizeInBits() / 8;
31172 int Ratio = NumBytes / NumMaskElts;
31173 for (int i = 0; i < NumBytes; ++i) {
31174 int M = Mask[i / Ratio];
31175 if (M == SM_SentinelUndef) {
31176 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
31179 if (M == SM_SentinelZero) {
31180 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
31183 M = Ratio * M + i % Ratio;
31184 assert((M / 16) == (i / 16) && "Lane crossing detected");
31185 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
31187 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
31188 Res = DAG.getBitcast(ByteVT, V1);
31189 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
31190 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
31191 return DAG.getBitcast(RootVT, Res);
31194 // With XOP, if we have a 128-bit binary input shuffle we can always combine
31195 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
31196 // slower than PSHUFB on targets that support both.
31197 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
31198 // VPPERM Mask Operation
31199 // Bits[4:0] - Byte Index (0 - 31)
31200 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
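// Illustrative example: a selector byte of 18 (0x12) picks byte 2 of the
// second 16-byte source, while 128 (0x80, i.e. Bits[7:5] = 4) produces a zero
// byte, which is what the SM_SentinelZero case below emits.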
31201 SmallVector<SDValue, 16> VPPERMMask;
31203 int Ratio = NumBytes / NumMaskElts;
31204 for (int i = 0; i < NumBytes; ++i) {
31205 int M = Mask[i / Ratio];
31206 if (M == SM_SentinelUndef) {
31207 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
31210 if (M == SM_SentinelZero) {
31211 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
31214 M = Ratio * M + i % Ratio;
31215 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
31217 MVT ByteVT = MVT::v16i8;
31218 V1 = DAG.getBitcast(ByteVT, V1);
31219 V2 = DAG.getBitcast(ByteVT, V2);
31220 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
31221 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
31222 return DAG.getBitcast(RootVT, Res);
31225 // Failed to find any combines.
31229 // Attempt to constant fold all of the constant source ops.
31230 // Returns the folded constant node if the entire shuffle folds to a constant.
31231 // TODO: Extend this to merge multiple constant Ops and update the mask.
31232 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
31233 ArrayRef<int> Mask, SDValue Root,
31234 bool HasVariableMask,
31236 const X86Subtarget &Subtarget) {
31237 MVT VT = Root.getSimpleValueType();
31239 unsigned SizeInBits = VT.getSizeInBits();
31240 unsigned NumMaskElts = Mask.size();
31241 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
31242 unsigned NumOps = Ops.size();
31244 // Extract constant bits from each source op.
31245 bool OneUseConstantOp = false;
31246 SmallVector<APInt, 16> UndefEltsOps(NumOps);
31247 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
31248 for (unsigned i = 0; i != NumOps; ++i) {
31249 SDValue SrcOp = Ops[i];
31250 OneUseConstantOp |= SrcOp.hasOneUse();
31251 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
31256 // Only fold if at least one of the constants is only used once or
31257 // the combined shuffle has included a variable mask shuffle; this
31258 // is to avoid constant pool bloat.
31259 if (!OneUseConstantOp && !HasVariableMask)
31262 // Shuffle the constant bits according to the mask.
31263 APInt UndefElts(NumMaskElts, 0);
31264 APInt ZeroElts(NumMaskElts, 0);
31265 APInt ConstantElts(NumMaskElts, 0);
31266 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
31267 APInt::getNullValue(MaskSizeInBits));
31268 for (unsigned i = 0; i != NumMaskElts; ++i) {
31270 if (M == SM_SentinelUndef) {
31271 UndefElts.setBit(i);
31273 } else if (M == SM_SentinelZero) {
31274 ZeroElts.setBit(i);
31277 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
31279 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
31280 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
31282 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
31283 if (SrcUndefElts[SrcMaskIdx]) {
31284 UndefElts.setBit(i);
31288 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
31289 APInt &Bits = SrcEltBits[SrcMaskIdx];
31291 ZeroElts.setBit(i);
31295 ConstantElts.setBit(i);
31296 ConstantBitData[i] = Bits;
31298 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
31300 // Create the constant data.
31302 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
31303 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
31305 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
31307 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
31310 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
31311 return DAG.getBitcast(VT, CstOp);
31314 /// Fully generic combining of x86 shuffle instructions.
31316 /// This should be the last combine run over the x86 shuffle instructions. Once
31317 /// they have been fully optimized, this will recursively consider all chains
31318 /// of single-use shuffle instructions, build a generic model of the cumulative
31319 /// shuffle operation, and check for simpler instructions which implement this
31320 /// operation. We use this primarily for two purposes:
31322 /// 1) Collapse generic shuffles to specialized single instructions when
31323 /// equivalent. In most cases, this is just an encoding size win, but
31324 /// sometimes we will collapse multiple generic shuffles into a single
31325 /// special-purpose shuffle.
31326 /// 2) Look for sequences of shuffle instructions with 3 or more total
31327 /// instructions, and replace them with the slightly more expensive SSSE3
31328 /// PSHUFB instruction if available. We do this as the last combining step
31329 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
31330 /// a suitable short sequence of other instructions. The PSHUFB will either
31331 /// use a register or have to read from memory and so is slightly (but only
31332 /// slightly) more expensive than the other shuffle instructions.
31334 /// Because this is inherently a quadratic operation (for each shuffle in
31335 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
31336 /// This should never be an issue in practice as the shuffle lowering doesn't
31337 /// produce sequences of more than 8 instructions.
31339 /// FIXME: We will currently miss some cases where the redundant shuffling
31340 /// would simplify under the threshold for PSHUFB formation because of
31341 /// combine-ordering. To fix this, we should do the redundant instruction
31342 /// combining in this recursive walk.
31343 static SDValue combineX86ShufflesRecursively(
31344 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
31345 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
31346 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
31347 const X86Subtarget &Subtarget) {
31348 // Bound the depth of our recursive combine because this is ultimately
31349 // quadratic in nature.
31350 const unsigned MaxRecursionDepth = 8;
31351 if (Depth > MaxRecursionDepth)
31354 // Directly rip through bitcasts to find the underlying operand.
31355 SDValue Op = SrcOps[SrcOpIndex];
31356 Op = peekThroughOneUseBitcasts(Op);
31358 MVT VT = Op.getSimpleValueType();
31359 if (!VT.isVector())
31360 return SDValue(); // Bail if we hit a non-vector.
31362 assert(Root.getSimpleValueType().isVector() &&
31363 "Shuffles operate on vector types!");
31364 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
31365 "Can only combine shuffles of the same vector register size.");
31367 // Extract target shuffle mask and resolve sentinels and inputs.
31368 SmallVector<int, 64> OpMask;
31369 SmallVector<SDValue, 2> OpInputs;
31370 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
31373 // TODO - Add support for more than 2 inputs.
31374 if (2 < OpInputs.size())
31377 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
31378 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
31380 // Add the inputs to the Ops list, avoiding duplicates.
31381 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
31383 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
31386 // Attempt to find an existing match.
31387 SDValue InputBC = peekThroughBitcasts(Input);
31388 for (int i = 0, e = Ops.size(); i < e; ++i)
31389 if (InputBC == peekThroughBitcasts(Ops[i]))
31391 // Match failed - should we replace an existing Op?
31392 if (InsertionPoint >= 0) {
31393 Ops[InsertionPoint] = Input;
31394 return InsertionPoint;
31396 // Add to the end of the Ops list.
31397 Ops.push_back(Input);
31398 return Ops.size() - 1;
31401 int InputIdx0 = AddOp(Input0, SrcOpIndex);
31402 int InputIdx1 = AddOp(Input1, -1);
31404 assert(((RootMask.size() > OpMask.size() &&
31405 RootMask.size() % OpMask.size() == 0) ||
31406 (OpMask.size() > RootMask.size() &&
31407 OpMask.size() % RootMask.size() == 0) ||
31408 OpMask.size() == RootMask.size()) &&
31409 "The smaller number of elements must divide the larger.");
31411 // This function can be performance-critical, so we rely on the power-of-2
31412 // knowledge that we have about the mask sizes to replace div/rem ops with
31413 // bit-masks and shifts.
31414 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
31415 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
31416 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
31417 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
31419 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
31420 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
31421 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
31422 assert((RootRatio == 1 || OpRatio == 1) &&
31423 "Must not have a ratio for both incoming and op masks!");
31425 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
31426 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
31427 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
31428 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
31429 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
31431 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
31433 // Merge this shuffle operation's mask into our accumulated mask. Note that
31434 // this shuffle's mask will be the first applied to the input, followed by the
31435 // root mask to get us all the way to the root value arrangement. The reason
31436 // for this order is that we are recursing up the operation chain.
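// Illustrative example: with equal mask widths and a single input, composing
// a root mask {2, 3, 0, 1} over an op mask {1, 0, 3, 2} yields
// {OpMask[2], OpMask[3], OpMask[0], OpMask[1]} = {3, 2, 1, 0}, since the op's
// shuffle is applied first and the root mask last.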
31437 for (unsigned i = 0; i < MaskWidth; ++i) {
31438 unsigned RootIdx = i >> RootRatioLog2;
31439 if (RootMask[RootIdx] < 0) {
31440 // This is a zero or undef lane, we're done.
31441 Mask[i] = RootMask[RootIdx];
31445 unsigned RootMaskedIdx =
31447 ? RootMask[RootIdx]
31448 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
31450 // Just insert the scaled root mask value if it references an input other
31451 // than the SrcOp we're currently inserting.
31452 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
31453 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
31454 Mask[i] = RootMaskedIdx;
31458 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
31459 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
31460 if (OpMask[OpIdx] < 0) {
31461 // The incoming lanes are zero or undef; it doesn't matter which ones we
31462 // actually use.
31463 Mask[i] = OpMask[OpIdx];
31467 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
31468 unsigned OpMaskedIdx =
31471 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
31473 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
31474 if (OpMask[OpIdx] < (int)OpMask.size()) {
31475 assert(0 <= InputIdx0 && "Unknown target shuffle input");
31476 OpMaskedIdx += InputIdx0 * MaskWidth;
31478 assert(0 <= InputIdx1 && "Unknown target shuffle input");
31479 OpMaskedIdx += InputIdx1 * MaskWidth;
31482 Mask[i] = OpMaskedIdx;
31485 // Handle the all undef/zero cases early.
31486 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
31487 return DAG.getUNDEF(Root.getValueType());
31489 // TODO - should we handle the mixed zero/undef case as well? Just returning
31490 // a zero mask will lose information on undef elements, possibly reducing
31491 // future combine possibilities.
31492 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
31493 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
31496 // Remove unused shuffle source ops.
31497 resolveTargetShuffleInputsAndMask(Ops, Mask);
31498 assert(!Ops.empty() && "Shuffle with no inputs detected");
31500 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
31502 // Update the list of shuffle nodes that have been combined so far.
31503 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
31505 CombinedNodes.push_back(Op.getNode());
31507 // See if we can recurse into each shuffle source op (if it's a target
31508 // shuffle). The source op should only be generally combined if it either has
31509 // a single use (i.e. current Op) or all its users have already been combined;
31510 // if not, then we can still combine but should prevent generation of variable
31511 // shuffles to avoid constant pool bloat.
31512 // Don't recurse if we already have more source ops than we can combine in
31513 // the remaining recursion depth.
31514 if (Ops.size() < (MaxRecursionDepth - Depth)) {
31515 for (int i = 0, e = Ops.size(); i < e; ++i) {
31516 bool AllowVar = false;
31517 if (Ops[i].getNode()->hasOneUse() ||
31518 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
31519 AllowVar = AllowVariableMask;
31520 if (SDValue Res = combineX86ShufflesRecursively(
31521 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
31522 AllowVar, DAG, Subtarget))
31527 // Attempt to constant fold all of the constant source ops.
31528 if (SDValue Cst = combineX86ShufflesConstants(
31529 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
31532 // We can only combine unary and binary shuffle mask cases.
31533 if (Ops.size() > 2)
31536 // Minor canonicalization of the accumulated shuffle mask to make it easier
31537 // to match below. All this does is detect masks with sequential pairs of
31538 // elements, and shrink them to the half-width mask. It does this in a loop
31539 // so it will reduce the size of the mask to the minimal width mask which
31540 // performs an equivalent shuffle.
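// Illustrative example: the 8-element mask {0, 1, 4, 5, 2, 3, 6, 7} has
// sequential pairs and shrinks to the 4-element mask {0, 2, 1, 3}; a second
// pass cannot pair it any further, so the loop stops there.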
31541 SmallVector<int, 64> WidenedMask;
31542 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
31543 Mask = std::move(WidenedMask);
31546 // Canonicalization of binary shuffle masks to improve pattern matching by
31547 // commuting the inputs.
31548 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
31549 ShuffleVectorSDNode::commuteMask(Mask);
31550 std::swap(Ops[0], Ops[1]);
31553 // Finally, try to combine into a single shuffle instruction.
31554 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
31555 AllowVariableMask, DAG, Subtarget);
31558 /// Get the PSHUF-style mask from PSHUF node.
31560 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
31561 /// PSHUF-style masks that can be reused with such instructions.
31562 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
31563 MVT VT = N.getSimpleValueType();
31564 SmallVector<int, 4> Mask;
31565 SmallVector<SDValue, 2> Ops;
31568 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
31572 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
31573 // matter. Check that the upper masks are repeats and remove them.
31574 if (VT.getSizeInBits() > 128) {
31575 int LaneElts = 128 / VT.getScalarSizeInBits();
31577 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
31578 for (int j = 0; j < LaneElts; ++j)
31579 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
31580 "Mask doesn't repeat in high 128-bit lanes!");
31582 Mask.resize(LaneElts);
31585 switch (N.getOpcode()) {
31586 case X86ISD::PSHUFD:
31588 case X86ISD::PSHUFLW:
31591 case X86ISD::PSHUFHW:
31592 Mask.erase(Mask.begin(), Mask.begin() + 4);
31593 for (int &M : Mask)
31597 llvm_unreachable("No valid shuffle instruction found!");
31601 /// Search for a combinable shuffle across a chain ending in pshufd.
31603 /// We walk up the chain and look for a combinable shuffle, skipping over
31604 /// shuffles that we could hoist this shuffle's transformation past without
31605 /// altering anything.
31607 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
31608 SelectionDAG &DAG) {
31609 assert(N.getOpcode() == X86ISD::PSHUFD &&
31610 "Called with something other than an x86 128-bit half shuffle!");
31613 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
31614 // of the shuffles in the chain so that we can form a fresh chain to replace
31615 // this chain.
31616 SmallVector<SDValue, 8> Chain;
31617 SDValue V = N.getOperand(0);
31618 for (; V.hasOneUse(); V = V.getOperand(0)) {
31619 switch (V.getOpcode()) {
31621 return SDValue(); // Nothing combined!
31624 // Skip bitcasts as we always know the type for the target specific
31625 // shuffles.
31628 case X86ISD::PSHUFD:
31629 // Found another dword shuffle.
31632 case X86ISD::PSHUFLW:
31633 // Check that the low words (being shuffled) are the identity in the
31634 // dword shuffle, and the high words are self-contained.
31635 if (Mask[0] != 0 || Mask[1] != 1 ||
31636 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
31639 Chain.push_back(V);
31642 case X86ISD::PSHUFHW:
31643 // Check that the high words (being shuffled) are the identity in the
31644 // dword shuffle, and the low words are self-contained.
31645 if (Mask[2] != 2 || Mask[3] != 3 ||
31646 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
31649 Chain.push_back(V);
31652 case X86ISD::UNPCKL:
31653 case X86ISD::UNPCKH:
31654 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
31655 // shuffle into a preceding word shuffle.
31656 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
31657 V.getSimpleValueType().getVectorElementType() != MVT::i16)
31660 // Search for a half-shuffle which we can combine with.
31661 unsigned CombineOp =
31662 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
31663 if (V.getOperand(0) != V.getOperand(1) ||
31664 !V->isOnlyUserOf(V.getOperand(0).getNode()))
31666 Chain.push_back(V);
31667 V = V.getOperand(0);
31669 switch (V.getOpcode()) {
31671 return SDValue(); // Nothing to combine.
31673 case X86ISD::PSHUFLW:
31674 case X86ISD::PSHUFHW:
31675 if (V.getOpcode() == CombineOp)
31678 Chain.push_back(V);
31682 V = V.getOperand(0);
31686 } while (V.hasOneUse());
31689 // Break out of the loop if we break out of the switch.
31693 if (!V.hasOneUse())
31694 // We fell out of the loop without finding a viable combining instruction.
31697 // Merge this node's mask and our incoming mask.
31698 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
31699 for (int &M : Mask)
31701 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
31702 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
31704 // Rebuild the chain around this new shuffle.
31705 while (!Chain.empty()) {
31706 SDValue W = Chain.pop_back_val();
31708 if (V.getValueType() != W.getOperand(0).getValueType())
31709 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
31711 switch (W.getOpcode()) {
31713 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
31715 case X86ISD::UNPCKL:
31716 case X86ISD::UNPCKH:
31717 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
31720 case X86ISD::PSHUFD:
31721 case X86ISD::PSHUFLW:
31722 case X86ISD::PSHUFHW:
31723 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
31727 if (V.getValueType() != N.getValueType())
31728 V = DAG.getBitcast(N.getValueType(), V);
31730 // Return the new chain to replace N.
31734 /// Try to combine x86 target specific shuffles.
31735 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
31736 TargetLowering::DAGCombinerInfo &DCI,
31737 const X86Subtarget &Subtarget) {
31739 MVT VT = N.getSimpleValueType();
31740 SmallVector<int, 4> Mask;
31741 unsigned Opcode = N.getOpcode();
31743 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
31744 // single instruction.
31745 if (VT.getScalarSizeInBits() == 64 &&
31746 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
31747 Opcode == X86ISD::UNPCKL)) {
31748 auto BC0 = peekThroughBitcasts(N.getOperand(0));
31749 auto BC1 = peekThroughBitcasts(N.getOperand(1));
31750 EVT VT0 = BC0.getValueType();
31751 EVT VT1 = BC1.getValueType();
31752 unsigned Opcode0 = BC0.getOpcode();
31753 unsigned Opcode1 = BC1.getOpcode();
31754 if (Opcode0 == Opcode1 && VT0 == VT1 &&
31755 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
31756 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
31757 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
31759 if (Opcode == X86ISD::MOVSD) {
31760 Lo = BC1.getOperand(0);
31761 Hi = BC0.getOperand(1);
31763 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
31764 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
31766 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
31767 return DAG.getBitcast(VT, Horiz);
31772 case X86ISD::VBROADCAST: {
31773 // If broadcasting from another shuffle, attempt to simplify it.
31774 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
31775 SDValue Src = N.getOperand(0);
31776 SDValue BC = peekThroughBitcasts(Src);
31777 EVT SrcVT = Src.getValueType();
31778 EVT BCVT = BC.getValueType();
31779 if (isTargetShuffle(BC.getOpcode()) &&
31780 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
31781 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
31782 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
31784 for (unsigned i = 0; i != Scale; ++i)
31785 DemandedMask[i] = i;
31786 if (SDValue Res = combineX86ShufflesRecursively(
31787 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
31788 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
31789 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
31790 DAG.getBitcast(SrcVT, Res));
31794 case X86ISD::PSHUFD:
31795 case X86ISD::PSHUFLW:
31796 case X86ISD::PSHUFHW:
31797 Mask = getPSHUFShuffleMask(N);
31798 assert(Mask.size() == 4);
31800 case X86ISD::MOVSD:
31801 case X86ISD::MOVSS: {
31802 SDValue N0 = N.getOperand(0);
31803 SDValue N1 = N.getOperand(1);
31805 // Canonicalize scalar FPOps:
31806 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
31807 // If commutable, allow OP(N1[0], N0[0]).
31808 unsigned Opcode1 = N1.getOpcode();
31809 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
31810 Opcode1 == ISD::FDIV) {
31811 SDValue N10 = N1.getOperand(0);
31812 SDValue N11 = N1.getOperand(1);
31814 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
31816 std::swap(N10, N11);
31817 MVT SVT = VT.getVectorElementType();
31818 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
31819 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
31820 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
31821 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
31822 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
31823 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
31829 case X86ISD::INSERTPS: {
31830 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
31831 SDValue Op0 = N.getOperand(0);
31832 SDValue Op1 = N.getOperand(1);
31833 SDValue Op2 = N.getOperand(2);
31834 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
31835 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
31836 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
31837 unsigned ZeroMask = InsertPSMask & 0xF;
31839 // If we zero out all elements from Op0 then we don't need to reference it.
31840 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
31841 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
31842 DAG.getConstant(InsertPSMask, DL, MVT::i8));
31844 // If we zero out the element from Op1 then we don't need to reference it.
31845 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
31846 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
31847 DAG.getConstant(InsertPSMask, DL, MVT::i8));
31849 // Attempt to merge insertps Op1 with an inner target shuffle node.
31850 SmallVector<int, 8> TargetMask1;
31851 SmallVector<SDValue, 2> Ops1;
31852 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
31853 int M = TargetMask1[SrcIdx];
31854 if (isUndefOrZero(M)) {
31855 // Zero/UNDEF insertion - zero out element and remove dependency.
31856 InsertPSMask |= (1u << DstIdx);
31857 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
31858 DAG.getConstant(InsertPSMask, DL, MVT::i8));
31860 // Update insertps mask srcidx and reference the source input directly.
31861 assert(0 <= M && M < 8 && "Shuffle index out of range");
31862 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
31863 Op1 = Ops1[M < 4 ? 0 : 1];
31864 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
31865 DAG.getConstant(InsertPSMask, DL, MVT::i8));
31868 // Attempt to merge insertps Op0 with an inner target shuffle node.
31869 SmallVector<int, 8> TargetMask0;
31870 SmallVector<SDValue, 2> Ops0;
31871 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
31874 bool Updated = false;
31875 bool UseInput00 = false;
31876 bool UseInput01 = false;
31877 for (int i = 0; i != 4; ++i) {
31878 int M = TargetMask0[i];
31879 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
31880 // No change if element is already zero or the inserted element.
31882 } else if (isUndefOrZero(M)) {
31883 // If the target mask is undef/zero then we must zero the element.
31884 InsertPSMask |= (1u << i);
31889 // The input vector element must be inline.
31890 if (M != i && M != (i + 4))
31893 // Determine which inputs of the target shuffle we're using.
31894 UseInput00 |= (0 <= M && M < 4);
31895 UseInput01 |= (4 <= M);
31898 // If we're not using both inputs of the target shuffle then use the
31899 // referenced input directly.
31900 if (UseInput00 && !UseInput01) {
31903 } else if (!UseInput00 && UseInput01) {
31909 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
31910 DAG.getConstant(InsertPSMask, DL, MVT::i8));
31918 // Nuke no-op shuffles that show up after combining.
31919 if (isNoopShuffleMask(Mask))
31920 return N.getOperand(0);
31922 // Look for simplifications involving one or two shuffle instructions.
31923 SDValue V = N.getOperand(0);
31924 switch (N.getOpcode()) {
31927 case X86ISD::PSHUFLW:
31928 case X86ISD::PSHUFHW:
31929 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
31931 // See if this reduces to a PSHUFD which is no more expensive and can
31932 // combine with more operations. Note that it has to at least flip the
31933 // dwords as otherwise it would have been removed as a no-op.
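// Illustrative example: a PSHUFLW with word mask {2, 3, 0, 1} just swaps the
// two low dwords, so it is re-expressed below as a PSHUFD with dword mask
// {1, 0, 2, 3}.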
31934 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
31935 int DMask[] = {0, 1, 2, 3};
31936 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
31937 DMask[DOffset + 0] = DOffset + 1;
31938 DMask[DOffset + 1] = DOffset + 0;
31939 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
31940 V = DAG.getBitcast(DVT, V);
31941 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
31942 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
31943 return DAG.getBitcast(VT, V);
31946 // Look for shuffle patterns which can be implemented as a single unpack.
31947 // FIXME: This doesn't handle the location of the PSHUFD generically, and
31948 // only works when we have a PSHUFD followed by two half-shuffles.
31949 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
31950 (V.getOpcode() == X86ISD::PSHUFLW ||
31951 V.getOpcode() == X86ISD::PSHUFHW) &&
31952 V.getOpcode() != N.getOpcode() &&
31954 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
31955 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
31956 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
31957 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
31958 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
31959 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
31961 for (int i = 0; i < 4; ++i) {
31962 WordMask[i + NOffset] = Mask[i] + NOffset;
31963 WordMask[i + VOffset] = VMask[i] + VOffset;
31965 // Map the word mask through the DWord mask.
31967 for (int i = 0; i < 8; ++i)
31968 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
31969 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
31970 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
31971 // We can replace all three shuffles with an unpack.
31972 V = DAG.getBitcast(VT, D.getOperand(0));
31973 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
31982 case X86ISD::PSHUFD:
31983 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
31992 /// Checks if the shuffle mask takes subsequent elements
31993 /// alternately from two vectors.
31994 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
31995 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
31997 int ParitySrc[2] = {-1, -1};
31998 unsigned Size = Mask.size();
31999 for (unsigned i = 0; i != Size; ++i) {
32004 // Make sure we are using the matching element from the input.
32005 if ((M % Size) != i)
32008 // Make sure we use the same input for all elements of the same parity.
32009 int Src = M / Size;
32010 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
32012 ParitySrc[i % 2] = Src;
32015 // Make sure each input is used.
32016 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
32019 Op0Even = ParitySrc[0] == 0;
32023 /// Returns true iff the shuffle node \p N can be replaced with an
32024 /// ADDSUB(SUBADD) operation. If so, the operands of the ADDSUB(SUBADD)
32025 /// operation are written to the parameters \p Opnd0 and \p Opnd1.
32027 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
32028 /// shuffle nodes so they are easier to match generically. We also insert dummy
32029 /// vector shuffle nodes for the operands, which explicitly discard the lanes
32030 /// that are unused by this operation, so that the rest of the combiner can
32031 /// see that they are unused.
32032 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
bool &IsSubAdd) {
32036 EVT VT = N->getValueType(0);
32037 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32038 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
32039 !VT.getSimpleVT().isFloatingPoint())
32042 // We only handle target-independent shuffles.
32043 // FIXME: It would be easy and harmless to use the target shuffle mask
32044 // extraction tool to support more.
32045 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
32048 SDValue V1 = N->getOperand(0);
32049 SDValue V2 = N->getOperand(1);
32051 // Make sure we have an FADD and an FSUB.
32052 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
32053 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
32054 V1.getOpcode() == V2.getOpcode())
32057 // If there are other uses of these operations we can't fold them.
32058 if (!V1->hasOneUse() || !V2->hasOneUse())
32061 // Ensure that both operations have the same operands. Note that we can
32062 // commute the FADD operands.
32064 if (V1.getOpcode() == ISD::FSUB) {
32065 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
32066 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
32067 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
32070 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
32071 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
32072 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
32073 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
32077 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return false;
32082 // It's a subadd if the vector in the even parity is an FADD.
IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
: V2->getOpcode() == ISD::FADD;

Opnd0 = LHS;
Opnd1 = RHS;
return true;
}
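// Illustrative example (editorial note): for v4f32,
//   (vector_shuffle<0,5,2,7> (fsub a, b), (fadd a, b))
// selects the subtraction for even lanes and the addition for odd lanes,
// which is exactly the semantics of ADDSUBPS, so the callers below can
// rewrite it as (X86ISD::ADDSUB a, b); when the FADD feeds the even lanes
// instead, IsSubAdd is set. Operand names a and b are placeholders.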
32091 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
32092 static SDValue combineShuffleToFMAddSub(SDNode *N,
32093 const X86Subtarget &Subtarget,
32094 SelectionDAG &DAG) {
32095 // We only handle target-independent shuffles.
32096 // FIXME: It would be easy and harmless to use the target shuffle mask
32097 // extraction tool to support more.
32098 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
32101 MVT VT = N->getSimpleValueType(0);
32102 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32103 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
// We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
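// Illustrative example (editorial note): for v4f32, a node such as
//   (vector_shuffle<0,5,2,7> (fma a, b, c), (X86Fmsub a, b, c))
// takes its even lanes from the FMA and its odd lanes from the FMSUB, i.e.
// {add, sub, add, sub}, which is FMSUBADD; with the FMSUB feeding the even
// lanes the same logic selects FMADDSUB instead. The operand names a, b, c
// are placeholders for this sketch.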
32107 SDValue Op0 = N->getOperand(0);
32108 SDValue Op1 = N->getOperand(1);
32109 SDValue FMAdd = Op0, FMSub = Op1;
32110 if (FMSub.getOpcode() != X86ISD::FMSUB)
32111 std::swap(FMAdd, FMSub);
32113 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
32114 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
32115 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
32116 FMAdd.getOperand(2) != FMSub.getOperand(2))
32119 // Check for correct shuffle mask.
32120 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return SDValue();
// FMAddSub takes the zeroth operand from the FMSub node.
32127 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
32128 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
32129 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
32130 FMAdd.getOperand(2));
32133 /// Try to combine a shuffle into a target-specific add-sub or
32134 /// mul-add-sub node.
32135 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
32136 const X86Subtarget &Subtarget,
32137 SelectionDAG &DAG) {
32138 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
SDValue Opnd0, Opnd1;
bool IsSubAdd;
if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
return SDValue();
32146 MVT VT = N->getSimpleValueType(0);
32149 // Try to generate X86ISD::FMADDSUB node here.
32151 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
32152 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
32153 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
32159 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
32160 // the ADDSUB idiom has been successfully recognized. There are no known
32161 // X86 targets with 512-bit ADDSUB instructions!
32162 if (VT.is512BitVector())
32165 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
32168 // We are looking for a shuffle where both sources are concatenated with undef
32169 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
32170 // if we can express this as a single-source shuffle, that's preferable.
32171 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
32172 const X86Subtarget &Subtarget) {
32173 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
32176 EVT VT = N->getValueType(0);
32178 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
32179 if (!VT.is128BitVector() && !VT.is256BitVector())
32182 if (VT.getVectorElementType() != MVT::i32 &&
32183 VT.getVectorElementType() != MVT::i64 &&
32184 VT.getVectorElementType() != MVT::f32 &&
32185 VT.getVectorElementType() != MVT::f64)
32188 SDValue N0 = N->getOperand(0);
32189 SDValue N1 = N->getOperand(1);
32191 // Check that both sources are concats with undef.
32192 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
32193 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
32194 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
32195 !N1.getOperand(1).isUndef())
32198 // Construct the new shuffle mask. Elements from the first source retain their
32199 // index, but elements from the second source no longer need to skip an undef.
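// Illustrative example (editorial note): with v8i32 operands where t1 and t2
// are v4i32,
//   shuffle (concat t1, undef), (concat t2, undef), <0,1,8,9,2,3,10,11>
// refers to lanes 8-11, which are really lanes 0-3 of t2, so after
// subtracting NumElts/2 (here 4) from second-source indices it becomes
//   shuffle (concat t1, t2), undef, <0,1,4,5,2,3,6,7>
// which a single VPERMD can lower.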
32200 SmallVector<int, 8> Mask;
32201 int NumElts = VT.getVectorNumElements();
32203 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
32204 for (int Elt : SVOp->getMask())
32205 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
32208 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
32210 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
32213 /// Eliminate a redundant shuffle of a horizontal math op.
32214 static SDValue foldShuffleOfHorizOp(SDNode *N) {
32215 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
32218 SDValue HOp = N->getOperand(0);
32219 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
32220 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
32223 // 128-bit horizontal math instructions are defined to operate on adjacent
32224 // lanes of each operand as:
32225 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
32226 // ...similarly for v2f64 and v8i16.
32227 // TODO: Handle UNDEF operands.
32228 if (HOp.getOperand(0) != HOp.getOperand(1))
32231 // When the operands of a horizontal math op are identical, the low half of
32232 // the result is the same as the high half. If the shuffle is also replicating
32233 // low and high halves, we don't need the shuffle.
32234 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
32235 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
32236 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
32237 // but this should be tied to whatever horizontal op matching and shuffle
32238 // canonicalization are producing.
32239 if (HOp.getValueSizeInBits() == 128 &&
32240 (isTargetShuffleEquivalent(Mask, {0, 0}) ||
32241 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
32242 isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
32245 if (HOp.getValueSizeInBits() == 256 &&
32246 (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
32247 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
32248 isTargetShuffleEquivalent(
32249 Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
32255 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
32256 TargetLowering::DAGCombinerInfo &DCI,
32257 const X86Subtarget &Subtarget) {
32259 EVT VT = N->getValueType(0);
32260 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32261 // If we have legalized the vector types, look for blends of FADD and FSUB
32262 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
32263 if (TLI.isTypeLegal(VT)) {
32264 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
32267 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
32271 // During Type Legalization, when promoting illegal vector types,
32272 // the backend might introduce new shuffle dag nodes and bitcasts.
32274 // This code performs the following transformation:
32275 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
32276 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
32278 // We do this only if both the bitcast and the BINOP dag nodes have
32279 // one use. Also, perform this transformation only if the new binary
32280 // operation is legal. This is to avoid introducing dag nodes that
32281 // potentially need to be further expanded (or custom lowered) into a
32282 // less optimal sequence of dag nodes.
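// Illustrative example (editorial note): with a legal v8i16 ADD, a node like
//   shuffle (bitcast v8i16 (add v4i32 A, B)), undef, <0,2,4,6,u,u,u,u>
// satisfies the checks below (half as many source elements, a mask selecting
// the even elements for the low half and undef for the rest) and is rewritten
// to
//   shuffle (add v8i16 (bitcast A), (bitcast B)), undef, <0,2,4,6,u,u,u,u>
// so later combines can look through the binary operation. A and B are
// placeholder operands.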
32283 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
32284 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
32285 N->getOperand(0).getOpcode() == ISD::BITCAST &&
32286 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
32287 SDValue N0 = N->getOperand(0);
32288 SDValue N1 = N->getOperand(1);
32290 SDValue BC0 = N0.getOperand(0);
32291 EVT SVT = BC0.getValueType();
32292 unsigned Opcode = BC0.getOpcode();
32293 unsigned NumElts = VT.getVectorNumElements();
32295 if (BC0.hasOneUse() && SVT.isVector() &&
32296 SVT.getVectorNumElements() * 2 == NumElts &&
32297 TLI.isOperationLegal(Opcode, VT)) {
32298 bool CanFold = false;
32304 // isOperationLegal lies for integer ops on floating point types.
32305 CanFold = VT.isInteger();
32310 // isOperationLegal lies for floating point ops on integer types.
32311 CanFold = VT.isFloatingPoint();
32315 unsigned SVTNumElts = SVT.getVectorNumElements();
32316 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
32317 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
32318 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
32319 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
32320 CanFold = SVOp->getMaskElt(i) < 0;
32323 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
32324 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
32325 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
32326 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
32331 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
32332 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
32333 // consecutive, non-overlapping, and in the right order.
32334 SmallVector<SDValue, 16> Elts;
32335 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
32336 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
32337 Elts.push_back(Elt);
32344 if (Elts.size() == VT.getVectorNumElements())
32346 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
32349 // For AVX2, we sometimes want to combine
32350 // (vector_shuffle <mask> (concat_vectors t1, undef)
32351 // (concat_vectors t2, undef))
32353 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
32354 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
32355 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
32358 if (isTargetShuffle(N->getOpcode())) {
32360 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
32363 // Try recursively combining arbitrary sequences of x86 shuffle
32364 // instructions into higher-order shuffles. We do this after combining
32365 // specific PSHUF instruction sequences into their minimal form so that we
32366 // can evaluate how many specialized shuffle instructions are involved in
32367 // a particular chain.
32368 if (SDValue Res = combineX86ShufflesRecursively(
32369 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
32370 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
32373 // Simplify source operands based on shuffle mask.
32374 // TODO - merge this into combineX86ShufflesRecursively.
32375 APInt KnownUndef, KnownZero;
32376 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
32377 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
32378 return SDValue(N, 0);
32381 // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
32382 // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
32383 // FIXME: This can probably go away once we default to widening legalization.
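// Illustrative example (editorial note): a widened v2i32 multiply typically
// appears as
//   shuffle (bitcast v4i32 (PMULUDQ (bitcast (shuffle X, X, <0,u,1,u>)), Y)),
//           undef, <0,2,u,u>
// where only the low 32 bits of each 64-bit product are kept; with SSE4.1
// the checks below replace the whole sequence with a single v4i32 ISD::MUL
// (PMULLD) on the original 32-bit operands. X and Y are placeholders.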
32384 if (Subtarget.hasSSE41() && VT == MVT::v4i32 &&
32385 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
32386 N->getOperand(0).getOpcode() == ISD::BITCAST &&
32387 N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
32388 SDValue BC = N->getOperand(0);
32389 SDValue MULUDQ = BC.getOperand(0);
32390 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
32391 ArrayRef<int> Mask = SVOp->getMask();
32392 if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
32393 Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
32394 SDValue Op0 = MULUDQ.getOperand(0);
32395 SDValue Op1 = MULUDQ.getOperand(1);
32396 if (Op0.getOpcode() == ISD::BITCAST &&
32397 Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
32398 Op0.getOperand(0).getValueType() == MVT::v4i32) {
32399 ShuffleVectorSDNode *SVOp0 =
32400 cast<ShuffleVectorSDNode>(Op0.getOperand(0));
32401 ArrayRef<int> Mask2 = SVOp0->getMask();
32402 if (Mask2[0] == 0 && Mask2[1] == -1 &&
32403 Mask2[2] == 1 && Mask2[3] == -1) {
32404 Op0 = SVOp0->getOperand(0);
32405 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
32406 Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
32407 return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
32410 if (Op1.getOpcode() == ISD::BITCAST &&
32411 Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
32412 Op1.getOperand(0).getValueType() == MVT::v4i32) {
32413 ShuffleVectorSDNode *SVOp1 =
32414 cast<ShuffleVectorSDNode>(Op1.getOperand(0));
32415 ArrayRef<int> Mask2 = SVOp1->getMask();
32416 if (Mask2[0] == 0 && Mask2[1] == -1 &&
32417 Mask2[2] == 1 && Mask2[3] == -1) {
32418 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
32419 Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
32420 Op1 = SVOp1->getOperand(0);
32421 return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
32430 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
32431 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
32432 TargetLoweringOpt &TLO, unsigned Depth) const {
32433 int NumElts = DemandedElts.getBitWidth();
32434 unsigned Opc = Op.getOpcode();
32435 EVT VT = Op.getValueType();
32437 // Handle special case opcodes.
32441 case X86ISD::VSRA: {
32442 // We only need the bottom 64-bits of the (128-bit) shift amount.
32443 SDValue Amt = Op.getOperand(1);
32444 MVT AmtVT = Amt.getSimpleValueType();
32445 assert(AmtVT.is128BitVector() && "Unexpected value type");
32446 APInt AmtUndef, AmtZero;
32447 unsigned NumAmtElts = AmtVT.getVectorNumElements();
32448 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
32449 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
32454 case X86ISD::VSHLI:
32455 case X86ISD::VSRLI:
32456 case X86ISD::VSRAI: {
32457 SDValue Src = Op.getOperand(0);
32459 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
32462 // TODO convert SrcUndef to KnownUndef.
32465 case X86ISD::CVTSI2P:
32466 case X86ISD::CVTUI2P: {
32467 SDValue Src = Op.getOperand(0);
32468 MVT SrcVT = Src.getSimpleValueType();
32469 APInt SrcUndef, SrcZero;
32470 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
32471 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
32476 case X86ISD::PACKSS:
32477 case X86ISD::PACKUS: {
32478 APInt DemandedLHS, DemandedRHS;
32479 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
32481 APInt SrcUndef, SrcZero;
32482 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
32483 SrcZero, TLO, Depth + 1))
32485 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
32486 SrcZero, TLO, Depth + 1))
32490 case X86ISD::VBROADCAST: {
32491 SDValue Src = Op.getOperand(0);
32492 MVT SrcVT = Src.getSimpleValueType();
32493 if (!SrcVT.isVector())
32495 // Don't bother broadcasting if we just need the 0'th element.
32496 if (DemandedElts == 1) {
if (Src.getValueType() != VT)
32498 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
32500 return TLO.CombineTo(Op, Src);
32502 APInt SrcUndef, SrcZero;
32503 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
32504 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
32509 case X86ISD::PSHUFB: {
32510 // TODO - simplify other variable shuffle masks.
32511 SDValue Mask = Op.getOperand(1);
32512 APInt MaskUndef, MaskZero;
32513 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
32520 // Simplify target shuffles.
32521 if (!isTargetShuffle(Opc) || !VT.isSimple())
32524 // Get target shuffle mask.
32526 SmallVector<int, 64> OpMask;
32527 SmallVector<SDValue, 2> OpInputs;
32528 if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs,
32532 // Shuffle inputs must be the same type as the result.
32533 if (llvm::any_of(OpInputs,
32534 [VT](SDValue V) { return VT != V.getValueType(); }))
32537 // Clear known elts that might have been set above.
32538 KnownZero.clearAllBits();
32539 KnownUndef.clearAllBits();
32541 // Check if shuffle mask can be simplified to undef/zero/identity.
32542 int NumSrcs = OpInputs.size();
32543 for (int i = 0; i != NumElts; ++i) {
32544 int &M = OpMask[i];
32545 if (!DemandedElts[i])
32546 M = SM_SentinelUndef;
32547 else if (0 <= M && OpInputs[M / NumElts].isUndef())
32548 M = SM_SentinelUndef;
32551 if (isUndefInRange(OpMask, 0, NumElts)) {
32552 KnownUndef.setAllBits();
32553 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
32555 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
32556 KnownZero.setAllBits();
32557 return TLO.CombineTo(
32558 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
32560 for (int Src = 0; Src != NumSrcs; ++Src)
32561 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
32562 return TLO.CombineTo(Op, OpInputs[Src]);
32564 // Attempt to simplify inputs.
32565 for (int Src = 0; Src != NumSrcs; ++Src) {
32566 int Lo = Src * NumElts;
32567 APInt SrcElts = APInt::getNullValue(NumElts);
32568 for (int i = 0; i != NumElts; ++i)
32569 if (DemandedElts[i]) {
32570 int M = OpMask[i] - Lo;
32571 if (0 <= M && M < NumElts)
32575 APInt SrcUndef, SrcZero;
32576 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
32581 // Extract known zero/undef elements.
32582 // TODO - Propagate input undef/zero elts.
32583 for (int i = 0; i != NumElts; ++i) {
32584 if (OpMask[i] == SM_SentinelUndef)
32585 KnownUndef.setBit(i);
32586 if (OpMask[i] == SM_SentinelZero)
32587 KnownZero.setBit(i);
32593 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
32594 SDValue Op, const APInt &OriginalDemandedBits,
32595 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
32596 unsigned Depth) const {
32597 EVT VT = Op.getValueType();
32598 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
32599 unsigned Opc = Op.getOpcode();
32601 case X86ISD::PMULDQ:
32602 case X86ISD::PMULUDQ: {
32603 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
32605 SDValue LHS = Op.getOperand(0);
32606 SDValue RHS = Op.getOperand(1);
32607 // FIXME: Can we bound this better?
32608 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
32609 if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
32611 if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
32615 case X86ISD::VSHLI: {
32616 SDValue Op0 = Op.getOperand(0);
32617 SDValue Op1 = Op.getOperand(1);
32619 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
32620 if (ShiftImm->getAPIntValue().uge(BitWidth))
32623 unsigned ShAmt = ShiftImm->getZExtValue();
32624 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
32626 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
32627 // single shift. We can do this if the bottom bits (which are shifted
32628 // out) are never demanded.
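// Worked example (editorial note): demanding only bits 8 and above of
//   (VSHLI (VSRLI X, 3), 5)
// gives countTrailingZeros() = 8 >= 5 and Diff = 5 - 3 = 2, so the pair is
// replaced by (VSHLI X, 2); the three low bits dropped by the original VSRLI
// were never demanded, and every demanded bit i still receives X's bit i-2.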
32629 if (Op0.getOpcode() == X86ISD::VSRLI &&
32630 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
32631 if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
32632 if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
32633 int Diff = ShAmt - Shift2Imm->getZExtValue();
32635 return TLO.CombineTo(Op, Op0.getOperand(0));
32637 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
32638 SDValue NewShift = TLO.DAG.getNode(
32639 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
32640 TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
32641 return TLO.CombineTo(Op, NewShift);
32646 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
32650 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
32651 Known.Zero <<= ShAmt;
32652 Known.One <<= ShAmt;
32654 // Low bits known zero.
32655 Known.Zero.setLowBits(ShAmt);
32659 case X86ISD::VSRLI: {
32660 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
32661 if (ShiftImm->getAPIntValue().uge(BitWidth))
32664 unsigned ShAmt = ShiftImm->getZExtValue();
32665 APInt DemandedMask = OriginalDemandedBits << ShAmt;
32667 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
32668 OriginalDemandedElts, Known, TLO, Depth + 1))
32671 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
32672 Known.Zero.lshrInPlace(ShAmt);
32673 Known.One.lshrInPlace(ShAmt);
32675 // High bits known zero.
32676 Known.Zero.setHighBits(ShAmt);
32680 case X86ISD::VSRAI: {
32681 SDValue Op0 = Op.getOperand(0);
32682 SDValue Op1 = Op.getOperand(1);
32684 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
32685 if (ShiftImm->getAPIntValue().uge(BitWidth))
32688 unsigned ShAmt = ShiftImm->getZExtValue();
32689 APInt DemandedMask = OriginalDemandedBits << ShAmt;
32691 // If we just want the sign bit then we don't need to shift it.
32692 if (OriginalDemandedBits.isSignMask())
32693 return TLO.CombineTo(Op, Op0);
32695 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
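// Worked example (editorial note): if X holds values sign-extended from i8 in
// 32-bit lanes, ComputeNumSignBits(X) >= 25, so (VSRAI (VSHLI X, 16), 16)
// folds to X: shifting left by 16 and arithmetically shifting back cannot
// change a value that already has more than 16 redundant sign bits.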
32696 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
32697 SDValue Op00 = Op0.getOperand(0);
32698 unsigned NumSignBits =
32699 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
32700 if (ShAmt < NumSignBits)
32701 return TLO.CombineTo(Op, Op00);
32704 // If any of the demanded bits are produced by the sign extension, we also
32705 // demand the input sign bit.
32706 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
32707 DemandedMask.setSignBit();
32709 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
32713 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
32714 Known.Zero.lshrInPlace(ShAmt);
32715 Known.One.lshrInPlace(ShAmt);
32717 // If the input sign bit is known to be zero, or if none of the top bits
32718 // are demanded, turn this into an unsigned shift right.
32719 if (Known.Zero[BitWidth - ShAmt - 1] ||
32720 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
32721 return TLO.CombineTo(
32722 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
32724 // High bits are known one.
32725 if (Known.One[BitWidth - ShAmt - 1])
32726 Known.One.setHighBits(ShAmt);
32730 case X86ISD::MOVMSK: {
32731 SDValue Src = Op.getOperand(0);
32732 MVT SrcVT = Src.getSimpleValueType();
32733 unsigned SrcBits = SrcVT.getScalarSizeInBits();
32734 unsigned NumElts = SrcVT.getVectorNumElements();
32736 // If we don't need the sign bits at all just return zero.
32737 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
32738 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
32740 // Only demand the vector elements of the sign bits we need.
32741 APInt KnownUndef, KnownZero;
32742 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
32743 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
32747 Known.Zero = KnownZero.zextOrSelf(BitWidth);
32748 Known.Zero.setHighBits(BitWidth - NumElts);
32750 // MOVMSK only uses the MSB from each vector element.
32751 KnownBits KnownSrc;
32752 if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
32753 KnownSrc, TLO, Depth + 1))
32756 if (KnownSrc.One[SrcBits - 1])
32757 Known.One.setLowBits(NumElts);
32758 else if (KnownSrc.Zero[SrcBits - 1])
32759 Known.Zero.setLowBits(NumElts);
32764 return TargetLowering::SimplifyDemandedBitsForTargetNode(
32765 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
32768 /// Check if a vector extract from a target-specific shuffle of a load can be
32769 /// folded into a single element load.
32770 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
32771 /// shuffles have been custom lowered so we need to handle those here.
32772 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
32773 TargetLowering::DAGCombinerInfo &DCI) {
32774 if (DCI.isBeforeLegalizeOps())
32777 SDValue InVec = N->getOperand(0);
32778 SDValue EltNo = N->getOperand(1);
32779 EVT EltVT = N->getValueType(0);
32781 if (!isa<ConstantSDNode>(EltNo))
32784 EVT OriginalVT = InVec.getValueType();
32786 // Peek through bitcasts, don't duplicate a load with other uses.
32787 InVec = peekThroughOneUseBitcasts(InVec);
32789 EVT CurrentVT = InVec.getValueType();
32790 if (!CurrentVT.isVector() ||
32791 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
32794 if (!isTargetShuffle(InVec.getOpcode()))
32797 // Don't duplicate a load with other uses.
32798 if (!InVec.hasOneUse())
32801 SmallVector<int, 16> ShuffleMask;
32802 SmallVector<SDValue, 2> ShuffleOps;
32804 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
32805 ShuffleOps, ShuffleMask, UnaryShuffle))
32808 // Select the input vector, guarding against out of range extract vector.
32809 unsigned NumElems = CurrentVT.getVectorNumElements();
32810 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
32811 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
32813 if (Idx == SM_SentinelZero)
32814 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
32815 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
32816 if (Idx == SM_SentinelUndef)
32817 return DAG.getUNDEF(EltVT);
32819 // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
32820 // won't handle it.
32821 if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
32824 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
32825 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1];
32827 // If inputs to shuffle are the same for both ops, then allow 2 uses
32828 unsigned AllowedUses =
32829 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
32831 if (LdNode.getOpcode() == ISD::BITCAST) {
32832 // Don't duplicate a load with other uses.
32833 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
32836 AllowedUses = 1; // only allow 1 load use if we have a bitcast
32837 LdNode = LdNode.getOperand(0);
32840 if (!ISD::isNormalLoad(LdNode.getNode()))
32843 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
32848 // If there's a bitcast before the shuffle, check if the load type and
32849 // alignment is valid.
32850 unsigned Align = LN0->getAlignment();
32851 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32852 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
32853 EltVT.getTypeForEVT(*DAG.getContext()));
32855 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
32858 // All checks match so transform back to vector_shuffle so that DAG combiner
32859 // can finish the job
// Create a shuffle node, taking into account the case that it's a unary shuffle
32863 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
32864 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
32866 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
32867 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
32871 // Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have a
// legal vXi1 type.
32877 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
32878 const X86Subtarget &Subtarget) {
32879 EVT VT = BitCast.getValueType();
32880 SDValue N0 = BitCast.getOperand(0);
32881 EVT VecVT = N0->getValueType(0);
32883 if (!VT.isScalarInteger() || !VecVT.isSimple())
32886 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
32887 // movmskb even with avx512. This will be better than truncating to vXi1 and
32888 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
32889 // vpcmpeqb/vpcmpgtb.
32890 bool IsTruncated = N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
32891 (N0.getOperand(0).getValueType() == MVT::v16i8 ||
32892 N0.getOperand(0).getValueType() == MVT::v32i8 ||
32893 N0.getOperand(0).getValueType() == MVT::v64i8);
32895 // With AVX512 vxi1 types are legal and we prefer using k-regs.
32896 // MOVMSK is supported in SSE2 or later.
32897 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
// v4f64. So all legal 128-bit and 256-bit vectors are covered except for
32902 // v8i16 and v16i16.
32903 // For these two cases, we can shuffle the upper element bytes to a
32904 // consecutive sequence at the start of the vector and treat the results as
32905 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
32906 // for v16i16 this is not the case, because the shuffle is expensive, so we
32907 // avoid sign-extending to this type entirely.
32908 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
32909 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
32911 switch (VecVT.getSimpleVT().SimpleTy) {
32915 SExtVT = MVT::v2i64;
32918 SExtVT = MVT::v4i32;
32919 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
32920 // sign-extend to a 256-bit operation to avoid truncation.
32921 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
32922 N0->getOperand(0).getValueType().is256BitVector()) {
32923 SExtVT = MVT::v4i64;
32927 SExtVT = MVT::v8i16;
32928 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
32929 // sign-extend to a 256-bit operation to match the compare.
32930 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
32931 // 256-bit because the shuffle is cheaper than sign extending the result of
32933 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
32934 (N0->getOperand(0).getValueType().is256BitVector() ||
32935 N0->getOperand(0).getValueType().is512BitVector())) {
32936 SExtVT = MVT::v8i32;
32940 SExtVT = MVT::v16i8;
32941 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
32942 // it is not profitable to sign-extend to 256-bit because this will
32943 // require an extra cross-lane shuffle which is more expensive than
32944 // truncating the result of the compare to 128-bits.
32947 SExtVT = MVT::v32i8;
// If we have AVX512F but not AVX512BW, and the input is a truncate from
// v64i8 (checked earlier), split the input and emit two PMOVMSKBs.
32952 if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
32953 SExtVT = MVT::v64i8;
32960 SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, N0);
32962 if (SExtVT == MVT::v64i8) {
32964 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32965 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32966 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32967 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32968 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32969 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32970 DAG.getConstant(32, DL, MVT::i8));
32971 V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32972 } else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
32973 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32975 if (SExtVT == MVT::v8i16)
32976 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
32977 DAG.getUNDEF(MVT::v8i16));
32978 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32980 return DAG.getZExtOrTrunc(V, DL, VT);
32983 // Convert a vXi1 constant build vector to the same width scalar integer.
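// Worked example (editorial note): the v4i1 constant <1, 0, 1, 1> sets bits
// 0, 2 and 3 of the accumulator and yields the i4 constant 0b1101 (13);
// undef elements are treated as 0.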
32984 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
32985 EVT SrcVT = Op.getValueType();
32986 assert(SrcVT.getVectorElementType() == MVT::i1 &&
32987 "Expected a vXi1 vector");
32988 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
32989 "Expected a constant build vector");
32991 APInt Imm(SrcVT.getVectorNumElements(), 0);
32992 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
32993 SDValue In = Op.getOperand(Idx);
32994 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
32997 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
32998 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
33001 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
33002 TargetLowering::DAGCombinerInfo &DCI,
33003 const X86Subtarget &Subtarget) {
33004 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
33006 if (!DCI.isBeforeLegalizeOps())
33009 // Only do this if we have k-registers.
33010 if (!Subtarget.hasAVX512())
33013 EVT DstVT = N->getValueType(0);
33014 SDValue Op = N->getOperand(0);
33015 EVT SrcVT = Op.getValueType();
33017 if (!Op.hasOneUse())
33020 // Look for logic ops.
33021 if (Op.getOpcode() != ISD::AND &&
33022 Op.getOpcode() != ISD::OR &&
33023 Op.getOpcode() != ISD::XOR)
33026 // Make sure we have a bitcast between mask registers and a scalar type.
33027 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
33028 DstVT.isScalarInteger()) &&
33029 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
33030 SrcVT.isScalarInteger()))
33033 SDValue LHS = Op.getOperand(0);
33034 SDValue RHS = Op.getOperand(1);
33036 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
33037 LHS.getOperand(0).getValueType() == DstVT)
33038 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
33039 DAG.getBitcast(DstVT, RHS));
33041 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
33042 RHS.getOperand(0).getValueType() == DstVT)
33043 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
33044 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
33046 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
33047 // Most of these have to move a constant from the scalar domain anyway.
33048 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
33049 RHS = combinevXi1ConstantToInteger(RHS, DAG);
33050 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
33051 DAG.getBitcast(DstVT, LHS), RHS);
33057 static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
33058 const X86Subtarget &Subtarget) {
33060 unsigned NumElts = N.getNumOperands();
33062 auto *BV = cast<BuildVectorSDNode>(N);
33063 SDValue Splat = BV->getSplatValue();
33065 // Build MMX element from integer GPR or SSE float values.
33066 auto CreateMMXElement = [&](SDValue V) {
33068 return DAG.getUNDEF(MVT::x86mmx);
33069 if (V.getValueType().isFloatingPoint()) {
33070 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
33071 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
33072 V = DAG.getBitcast(MVT::v2i64, V);
33073 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
33075 V = DAG.getBitcast(MVT::i32, V);
33077 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
33079 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
33082 // Convert build vector ops to MMX data in the bottom elements.
33083 SmallVector<SDValue, 8> Ops;
33085 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
33087 if (Splat.isUndef())
33088 return DAG.getUNDEF(MVT::x86mmx);
33090 Splat = CreateMMXElement(Splat);
33092 if (Subtarget.hasSSE1()) {
33093 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
33095 Splat = DAG.getNode(
33096 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
33097 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
33100 // Use PSHUFW to repeat 16-bit elements.
33101 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
33102 return DAG.getNode(
33103 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
33104 DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
33105 DAG.getConstant(ShufMask, DL, MVT::i8));
33107 Ops.append(NumElts, Splat);
33109 for (unsigned i = 0; i != NumElts; ++i)
33110 Ops.push_back(CreateMMXElement(N.getOperand(i)));
33113 // Use tree of PUNPCKLs to build up general MMX vector.
33114 while (Ops.size() > 1) {
33115 unsigned NumOps = Ops.size();
33116 unsigned IntrinOp =
33117 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
33118 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
33119 : Intrinsic::x86_mmx_punpcklbw));
33120 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
33121 for (unsigned i = 0; i != NumOps; i += 2)
33122 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
33123 Ops[i], Ops[i + 1]);
33124 Ops.resize(NumOps / 2);
33130 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
33131 TargetLowering::DAGCombinerInfo &DCI,
33132 const X86Subtarget &Subtarget) {
33133 SDValue N0 = N->getOperand(0);
33134 EVT VT = N->getValueType(0);
33135 EVT SrcVT = N0.getValueType();
33137 // Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the setcc result is scalarized on subtargets that don't have a
// legal vXi1 type.
33143 if (DCI.isBeforeLegalize()) {
33144 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
33147 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
33148 // type, widen both sides to avoid a trip through memory.
33149 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
33150 Subtarget.hasAVX512()) {
33152 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
33153 N0 = DAG.getBitcast(MVT::v8i1, N0);
33154 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
33155 DAG.getIntPtrConstant(0, dl));
33158 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
33159 // type, widen both sides to avoid a trip through memory.
33160 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
33161 Subtarget.hasAVX512()) {
33163 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
33164 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
33166 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
33167 N0 = DAG.getBitcast(MVT::i8, N0);
33168 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
33172 // Since MMX types are special and don't usually play with other vector types,
33173 // it's better to handle them early to be sure we emit efficient code by
33174 // avoiding store-load conversions.
33175 if (VT == MVT::x86mmx) {
33176 // Detect MMX constant vectors.
33178 SmallVector<APInt, 1> EltBits;
33179 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
33181 // Handle zero-extension of i32 with MOVD.
33182 if (EltBits[0].countLeadingZeros() >= 32)
33183 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
33184 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
33185 // Else, bitcast to a double.
33186 // TODO - investigate supporting sext 32-bit immediates on x86_64.
33187 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
33188 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
33191 // Detect bitcasts to x86mmx low word.
33192 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
33193 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
33194 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
33195 bool LowUndef = true, AllUndefOrZero = true;
33196 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
33197 SDValue Op = N0.getOperand(i);
33198 LowUndef &= Op.isUndef() || (i >= e/2);
33199 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
33201 if (AllUndefOrZero) {
33202 SDValue N00 = N0.getOperand(0);
33204 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
33205 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
33206 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
33210 // Detect bitcasts of 64-bit build vectors and convert to a
33211 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
33213 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
33214 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
33215 SrcVT == MVT::v8i8))
33216 return createMMXBuildVector(N0, DAG, Subtarget);
33218 // Detect bitcasts between element or subvector extraction to x86mmx.
33219 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
33220 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
33221 isNullConstant(N0.getOperand(1))) {
33222 SDValue N00 = N0.getOperand(0);
33223 if (N00.getValueType().is128BitVector())
33224 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
33225 DAG.getBitcast(MVT::v2i64, N00));
33228 // Detect bitcasts from FP_TO_SINT to x86mmx.
33229 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
33231 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
33232 DAG.getUNDEF(MVT::v2i32));
33233 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
33234 DAG.getBitcast(MVT::v2i64, Res));
33238 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
33239 // most of these to scalar anyway.
33240 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
33241 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
33242 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
33243 return combinevXi1ConstantToInteger(N0, DAG);
33246 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
33247 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
33248 isa<ConstantSDNode>(N0)) {
33249 auto *C = cast<ConstantSDNode>(N0);
33250 if (C->isAllOnesValue())
33251 return DAG.getConstant(1, SDLoc(N0), VT);
33252 if (C->isNullValue())
33253 return DAG.getConstant(0, SDLoc(N0), VT);
33256 // Try to remove bitcasts from input and output of mask arithmetic to
33257 // remove GPR<->K-register crossings.
33258 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
33261 // Convert a bitcasted integer logic operation that has one bitcasted
33262 // floating-point operand into a floating-point logic operation. This may
33263 // create a load of a constant, but that is cheaper than materializing the
33264 // constant in an integer register and transferring it to an SSE register or
33265 // transferring the SSE operand to integer register and back.
33267 switch (N0.getOpcode()) {
33268 case ISD::AND: FPOpcode = X86ISD::FAND; break;
33269 case ISD::OR: FPOpcode = X86ISD::FOR; break;
33270 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
33271 default: return SDValue();
33274 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
33275 (Subtarget.hasSSE2() && VT == MVT::f64)))
33278 SDValue LogicOp0 = N0.getOperand(0);
33279 SDValue LogicOp1 = N0.getOperand(1);
33282 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
33283 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
33284 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
33285 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
33286 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
33287 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
33289 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
33290 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
33291 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
33292 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
33293 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
33294 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
33300 // Given a select, detect the following pattern:
33301 // 1: %2 = zext <N x i8> %0 to <N x i32>
33302 // 2: %3 = zext <N x i8> %1 to <N x i32>
33303 // 3: %4 = sub nsw <N x i32> %2, %3
33304 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
33305 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
33306 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
33307 // This is useful as it is the input into a SAD pattern.
33308 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
33310 // Check the condition of the select instruction is greater-than.
33311 SDValue SetCC = Select->getOperand(0);
33312 if (SetCC.getOpcode() != ISD::SETCC)
33314 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
33315 if (CC != ISD::SETGT && CC != ISD::SETLT)
33318 SDValue SelectOp1 = Select->getOperand(1);
33319 SDValue SelectOp2 = Select->getOperand(2);
33321 // The following instructions assume SelectOp1 is the subtraction operand
33322 // and SelectOp2 is the negation operand.
33323 // In the case of SETLT this is the other way around.
33324 if (CC == ISD::SETLT)
33325 std::swap(SelectOp1, SelectOp2);
33327 // The second operand of the select should be the negation of the first
33328 // operand, which is implemented as 0 - SelectOp1.
33329 if (!(SelectOp2.getOpcode() == ISD::SUB &&
33330 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
33331 SelectOp2.getOperand(1) == SelectOp1))
33334 // The first operand of SetCC is the first operand of the select, which is the
33335 // difference between the two input vectors.
33336 if (SetCC.getOperand(0) != SelectOp1)
// In the SETLT case, the second operand of the comparison can be either 1 or 0.
APInt SplatVal;
33341 if ((CC == ISD::SETLT) &&
33342 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
33343 SplatVal.isOneValue()) ||
33344 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
// In the SETGT case, the second operand of the comparison can be either -1 or 0.
33348 if ((CC == ISD::SETGT) &&
33349 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
33350 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
// The first operand of the select is the difference between the two input
// vectors.
33355 if (SelectOp1.getOpcode() != ISD::SUB)
33358 Op0 = SelectOp1.getOperand(0);
33359 Op1 = SelectOp1.getOperand(1);
33361 // Check if the operands of the sub are zero-extended from vectors of i8.
33362 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
33363 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
33364 Op1.getOpcode() != ISD::ZERO_EXTEND ||
33365 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
33371 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
33373 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
33374 const SDValue &Zext1, const SDLoc &DL,
33375 const X86Subtarget &Subtarget) {
33376 // Find the appropriate width for the PSADBW.
33377 EVT InVT = Zext0.getOperand(0).getValueType();
33378 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
33380 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
33381 // fill in the missing vector elements with 0.
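// Illustrative note (editorial): PSADBW sums absolute differences over each
// group of eight byte lanes into one 64-bit lane, e.g. v16i8 inputs A and B
// produce the v2i64 result
//   { sum|A[0..7]-B[0..7]| , sum|A[8..15]-B[8..15]| }
// so padding narrower inputs with zeros is harmless: the padded lanes add
// |0 - 0| = 0 to their group.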
33382 unsigned NumConcat = RegSize / InVT.getSizeInBits();
33383 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
33384 Ops[0] = Zext0.getOperand(0);
33385 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
33386 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
33387 Ops[0] = Zext1.getOperand(0);
33388 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
33390 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
33391 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33392 ArrayRef<SDValue> Ops) {
33393 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
33394 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
33396 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
33397 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
33403 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
33404 const X86Subtarget &Subtarget) {
33405 // Bail without SSE41.
33406 if (!Subtarget.hasSSE41())
33409 EVT ExtractVT = Extract->getValueType(0);
33410 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
33413 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
33414 ISD::NodeType BinOp;
33415 SDValue Src = DAG.matchBinOpReduction(
33416 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
33420 EVT SrcVT = Src.getValueType();
33421 EVT SrcSVT = SrcVT.getScalarType();
33422 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
33426 SDValue MinPos = Src;
33428 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
33429 while (SrcVT.getSizeInBits() > 128) {
33430 unsigned NumElts = SrcVT.getVectorNumElements();
33431 unsigned NumSubElts = NumElts / 2;
33432 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
33433 unsigned SubSizeInBits = SrcVT.getSizeInBits();
33434 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
33435 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
33436 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
33438 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
33439 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
33440 "Unexpected value type");
33442 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
33443 // to flip the value accordingly.
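// Worked example (editorial note): PHMINPOSUW only computes an unsigned
// minimum. XORing every i16 element with 0x7FFF reverses the signed order
// when values are compared as unsigned (0x7FFF -> 0x0000, 0x8000 -> 0xFFFF),
// so the unsigned minimum of the flipped elements is the signed maximum of
// the originals; the second XOR after the PHMINPOS restores the actual value.
// SMIN and UMAX use the analogous masks chosen below.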
SDValue Mask;
unsigned MaskEltsBits = ExtractVT.getSizeInBits();
33446 if (BinOp == ISD::SMAX)
33447 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
33448 else if (BinOp == ISD::SMIN)
33449 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
33450 else if (BinOp == ISD::UMAX)
33451 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
33454 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
33456 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
33457 // shuffling each upper element down and insert zeros. This means that the
33458 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
33459 // ready for the PHMINPOS.
33460 if (ExtractVT == MVT::i8) {
33461 SDValue Upper = DAG.getVectorShuffle(
33462 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
33463 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
33464 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
// Perform the PHMINPOS on a v8i16 vector.
33468 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
33469 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
33470 MinPos = DAG.getBitcast(SrcVT, MinPos);
33473 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
33475 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
33476 DAG.getIntPtrConstant(0, DL));
33479 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
33480 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
33482 const X86Subtarget &Subtarget) {
33483 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
33484 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
33487 EVT ExtractVT = Extract->getValueType(0);
33488 unsigned BitWidth = ExtractVT.getSizeInBits();
33489 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
33490 ExtractVT != MVT::i8)
33493 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
33494 ISD::NodeType BinOp;
33495 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
33499 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
33500 // which we can't support here for now.
33501 if (Match.getScalarValueSizeInBits() != BitWidth)
33504 // We require AVX2 for PMOVMSKB for v16i16/v32i8;
33505 unsigned MatchSizeInBits = Match.getValueSizeInBits();
33506 if (!(MatchSizeInBits == 128 ||
33507 (MatchSizeInBits == 256 &&
33508 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
33511 // Don't bother performing this for 2-element vectors.
33512 if (Match.getValueType().getVectorNumElements() <= 2)
33515 // Check that we are extracting a reduction of all sign bits.
33516 if (DAG.ComputeNumSignBits(Match) != BitWidth)
33519 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
33521 if (64 == BitWidth || 32 == BitWidth)
33522 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
33523 MatchSizeInBits / BitWidth);
33525 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
APInt CompareBits;
ISD::CondCode CondCode;
33529 if (BinOp == ISD::OR) {
33530 // any_of -> MOVMSK != 0
33531 CompareBits = APInt::getNullValue(32);
33532 CondCode = ISD::CondCode::SETNE;
33534 // all_of -> MOVMSK == ((1 << NumElts) - 1)
33535 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
33536 CondCode = ISD::CondCode::SETEQ;
// Perform the select as i32/i64 and then truncate to avoid partial register
// stalls.
33541 unsigned ResWidth = std::max(BitWidth, 32u);
33542 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
33544 SDValue Zero = DAG.getConstant(0, DL, ResVT);
33545 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
33546 SDValue Res = DAG.getBitcast(MaskVT, Match);
33547 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
33548 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
33549 Ones, Zero, CondCode);
33550 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
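// Worked example (editorial note): for a v4i32 compare result (each lane is
// all-zeros or all-ones), MOVMSKPS packs the four sign bits into bits 0-3 of
// a GPR, so any_of becomes (movmsk != 0) and all_of becomes (movmsk == 0xF),
// matching the CompareBits/CondCode selection above.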
33553 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
33554 const X86Subtarget &Subtarget) {
33555 // PSADBW is only supported on SSE2 and up.
33556 if (!Subtarget.hasSSE2())
33559 // Verify the type we're extracting from is any integer type above i16.
33560 EVT VT = Extract->getOperand(0).getValueType();
33561 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
33564 unsigned RegSize = 128;
33565 if (Subtarget.useBWIRegs())
33567 else if (Subtarget.hasAVX())
// We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
33571 // TODO: We should be able to handle larger vectors by splitting them before
33572 // feeding them into several SADs, and then reducing over those.
33573 if (RegSize / VT.getVectorNumElements() < 8)
33576 // Match shuffle + add pyramid.
33577 ISD::NodeType BinOp;
33578 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
33580 // The operand is expected to be zero extended from i8
33581 // (verified in detectZextAbsDiff).
33582 // In order to convert to i64 and above, additional any/zero/sign
33583 // extend is expected.
33584 // The zero extend from 32 bit has no mathematical effect on the result.
33585 // Also the sign extend is basically zero extend
33586 // (extends the sign bit which is zero).
33587 // So it is correct to skip the sign/zero extend instruction.
33588 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
33589 Root.getOpcode() == ISD::ZERO_EXTEND ||
33590 Root.getOpcode() == ISD::ANY_EXTEND))
33591 Root = Root.getOperand(0);
33593 // If there was a match, we want Root to be a select that is the root of an
33594 // abs-diff pattern.
33595 if (!Root || (Root.getOpcode() != ISD::VSELECT))
33598 // Check whether we have an abs-diff pattern feeding into the select.
33599 SDValue Zext0, Zext1;
33600 if (!detectZextAbsDiff(Root, Zext0, Zext1))
33603 // Create the SAD instruction.
33605 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
33607 // If the original vector was wider than 8 elements, sum over the results
33608 // in the SAD vector.
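// Worked example (editorial note): when the reduced vector has 16 elements
// (abs-diffs of v16i8 inputs), Stages is 4 and PSADBW yields a v2i64 with two
// partial sums; the loop below runs once (i = 1), shuffles the high i64 lane
// down with mask {1, -1}, adds it into lane 0, and the final extract reads
// that lane.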
33609 unsigned Stages = Log2_32(VT.getVectorNumElements());
33610 MVT SadVT = SAD.getSimpleValueType();
33612 unsigned SadElems = SadVT.getVectorNumElements();
for (unsigned i = Stages - 3; i > 0; --i) {
SmallVector<int, 16> Mask(SadElems, -1);
for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
Mask[j] = MaskEnd + j;
SDValue Shuffle =
DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
33621 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
33625 MVT Type = Extract->getSimpleValueType(0);
33626 unsigned TypeSizeInBits = Type.getSizeInBits();
33627 // Return the lowest TypeSizeInBits bits.
33628 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
33629 SAD = DAG.getBitcast(ResVT, SAD);
33630 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
33631 Extract->getOperand(1));
33634 // Attempt to peek through a target shuffle and extract the scalar from the source vector.
33636 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
33637 TargetLowering::DAGCombinerInfo &DCI,
33638 const X86Subtarget &Subtarget) {
33639 if (DCI.isBeforeLegalizeOps())
33642 SDValue Src = N->getOperand(0);
33643 SDValue Idx = N->getOperand(1);
33645 EVT VT = N->getValueType(0);
33646 EVT SrcVT = Src.getValueType();
33647 EVT SrcSVT = SrcVT.getVectorElementType();
33648 unsigned NumSrcElts = SrcVT.getVectorNumElements();
33650 // Don't attempt this for boolean mask vectors or unknown extraction indices.
33651 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
33654 // Handle extract(broadcast(scalar_value)); it doesn't matter what the index is.
33655 if (X86ISD::VBROADCAST == Src.getOpcode() &&
33656 Src.getOperand(0).getValueType() == VT)
33657 return Src.getOperand(0);
33659 // Resolve the target shuffle inputs and mask.
33660 SmallVector<int, 16> Mask;
33661 SmallVector<SDValue, 2> Ops;
33662 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
33665 // Attempt to narrow/widen the shuffle mask to the correct size.
33666 if (Mask.size() != NumSrcElts) {
33667 if ((NumSrcElts % Mask.size()) == 0) {
33668 SmallVector<int, 16> ScaledMask;
33669 int Scale = NumSrcElts / Mask.size();
33670 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
33671 Mask = std::move(ScaledMask);
33672 } else if ((Mask.size() % NumSrcElts) == 0) {
33673 // Simplify Mask based on demanded element.
33674 int ExtractIdx = (int)N->getConstantOperandVal(1);
33675 int Scale = Mask.size() / NumSrcElts;
33676 int Lo = Scale * ExtractIdx;
33677 int Hi = Scale * (ExtractIdx + 1);
33678 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
33679 if (i < Lo || Hi <= i)
33680 Mask[i] = SM_SentinelUndef;
33682 SmallVector<int, 16> WidenedMask;
33683 while (Mask.size() > NumSrcElts &&
33684 canWidenShuffleElements(Mask, WidenedMask))
33685 Mask = std::move(WidenedMask);
33686 // TODO - investigate support for wider shuffle masks with known upper
33687 // undef/zero elements for implicit zero-extension.
33691 // Check if narrowing/widening failed.
33692 if (Mask.size() != NumSrcElts)
33695 int SrcIdx = Mask[N->getConstantOperandVal(1)];
33698 // If the shuffle source element is undef/zero then we can just accept it.
33699 if (SrcIdx == SM_SentinelUndef)
33700 return DAG.getUNDEF(VT);
33702 if (SrcIdx == SM_SentinelZero)
33703 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
33704 : DAG.getConstant(0, dl, VT);
33706 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
33707 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
33708 SrcIdx = SrcIdx % Mask.size();
33710 // We can only extract other elements from 128-bit vectors and in certain
33711 // circumstances, depending on SSE-level.
33712 // TODO: Investigate using extract_subvector for larger vectors.
33713 // TODO: Investigate float/double extraction if it will be just stored.
33714 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
33715 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
33716 assert(SrcSVT == VT && "Unexpected extraction type");
33717 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
33718 DAG.getIntPtrConstant(SrcIdx, dl));
33721 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
33722 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
33723 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
33724 "Unexpected extraction type");
33725 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
33726 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
33727 DAG.getIntPtrConstant(SrcIdx, dl));
33728 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
33734 /// Detect vector gather/scatter index generation and convert it from being a
33735 /// bunch of shuffles and extracts into a somewhat faster sequence.
33736 /// For i686, the best sequence is apparently storing the value and loading
33737 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
33738 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
33739 TargetLowering::DAGCombinerInfo &DCI,
33740 const X86Subtarget &Subtarget) {
33741 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
33744 // TODO - Remove this once we can handle the implicit zero-extension of
33745 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
33746 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
33747 // combineBasicSADPattern.
33748 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
33751 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
33754 SDValue InputVector = N->getOperand(0);
33755 SDValue EltIdx = N->getOperand(1);
33757 EVT SrcVT = InputVector.getValueType();
33758 EVT VT = N->getValueType(0);
33759 SDLoc dl(InputVector);
33761 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
33762 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
33763 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
33764 SDValue MMXSrc = InputVector.getOperand(0);
33766 // The bitcast source is a direct mmx result.
33767 if (MMXSrc.getValueType() == MVT::x86mmx)
33768 return DAG.getBitcast(VT, InputVector);
33771 // Detect mmx to i32 conversion through a v2i32 elt extract.
33772 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
33773 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
33774 SDValue MMXSrc = InputVector.getOperand(0);
33776 // The bitcast source is a direct mmx result.
33777 if (MMXSrc.getValueType() == MVT::x86mmx)
33778 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
33781 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
33782 isa<ConstantSDNode>(EltIdx) &&
33783 isa<ConstantSDNode>(InputVector.getOperand(0))) {
33784 uint64_t ExtractedElt = N->getConstantOperandVal(1);
33785 auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
33786 const APInt &InputValue = InputC->getAPIntValue();
33787 uint64_t Res = InputValue[ExtractedElt];
33788 return DAG.getConstant(Res, dl, MVT::i1);
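// Small worked example (constant chosen for illustration): extracting element
// 3 of (v8i1 bitcast of the i8 constant 0b00001010) reads bit 3 of the APInt
// and folds straight to the i1 constant 1.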
33791 // Check whether this extract is the root of a sum of absolute differences
33792 // pattern. This has to be done here because we really want it to happen
33793 // pre-legalization.
33794 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
33797 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
33798 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
33801 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
33802 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
33808 /// If a vector select has an operand that is -1 or 0, try to simplify the
33809 /// select to a bitwise logic operation.
33810 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
33812 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
33813 TargetLowering::DAGCombinerInfo &DCI,
33814 const X86Subtarget &Subtarget) {
33815 SDValue Cond = N->getOperand(0);
33816 SDValue LHS = N->getOperand(1);
33817 SDValue RHS = N->getOperand(2);
33818 EVT VT = LHS.getValueType();
33819 EVT CondVT = Cond.getValueType();
33821 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33823 if (N->getOpcode() != ISD::VSELECT)
33826 assert(CondVT.isVector() && "Vector select expects a vector selector!");
33828 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
33829 // Check if the first operand is all zeros and Cond type is vXi1.
33830 // This situation only applies to avx512.
33831 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
33832 CondVT.getVectorElementType() == MVT::i1) {
33833 // Invert the cond to not(cond) : xor(op,allones)=not(op)
33834 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
33835 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
33836 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
33839 // To use the condition operand as a bitwise mask, it must have elements that
33840 // are the same size as the select elements. I.e., the condition operand must
33841 // have already been promoted from the IR select condition type <N x i1>.
33842 // Don't check if the types themselves are equal because that excludes
33843 // vector floating-point selects.
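// For example (types chosen for illustration): a v4f32 vselect whose
// condition is a sign-splatted v4i32 compare result can become plain bit
// math, e.g. vselect C, X, 0 -> bitcast (and C, bitcast X), which is why only
// the element sizes have to match, not the exact types.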
33844 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
33847 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
33848 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
33850 // Try to invert the condition if the true value is not all 1s and the false value is not all 0s.
33852 if (!TValIsAllOnes && !FValIsAllZeros &&
33853 // Check if the selector will be produced by CMPP*/PCMP*.
33854 Cond.getOpcode() == ISD::SETCC &&
33855 // Check if SETCC has already been promoted.
33856 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
33858 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
33860 if (TValIsAllZeros || FValIsAllOnes) {
33861 SDValue CC = Cond.getOperand(2);
33862 ISD::CondCode NewCC =
33863 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
33864 Cond.getOperand(0).getValueType().isInteger());
33865 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
33867 std::swap(LHS, RHS);
33868 TValIsAllOnes = FValIsAllOnes;
33869 FValIsAllZeros = TValIsAllZeros;
33873 // Cond value must be 'sign splat' to be converted to a logical op.
33874 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
33877 // vselect Cond, 111..., 000... -> Cond
33878 if (TValIsAllOnes && FValIsAllZeros)
33879 return DAG.getBitcast(VT, Cond);
33881 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
33884 // vselect Cond, 111..., X -> or Cond, X
33885 if (TValIsAllOnes) {
33886 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
33887 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
33888 return DAG.getBitcast(VT, Or);
33891 // vselect Cond, X, 000... -> and Cond, X
33892 if (FValIsAllZeros) {
33893 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
33894 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
33895 return DAG.getBitcast(VT, And);
33898 // vselect Cond, 000..., X -> andn Cond, X
33899 if (TValIsAllZeros) {
33900 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
33901 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
33902 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
33903 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
33904 return DAG.getBitcast(VT, AndN);
33910 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
33911 SDValue Cond = N->getOperand(0);
33912 SDValue LHS = N->getOperand(1);
33913 SDValue RHS = N->getOperand(2);
33916 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
33917 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
33918 if (!TrueC || !FalseC)
33921 // Don't do this for crazy integer types.
33922 EVT VT = N->getValueType(0);
33923 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33926 // We're going to use the condition bit in math or logic ops. We could allow
33927 // this with a wider condition value (post-legalization it becomes an i8),
33928 // but if nothing is creating selects that late, it doesn't matter.
33929 if (Cond.getValueType() != MVT::i1)
33932 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
33933 // 3, 5, or 9 with i32/i64, so those get transformed too.
33934 // TODO: For constants that overflow or do not differ by power-of-2 or small
33935 // multiplier, convert to 'and' + 'add'.
33936 const APInt &TrueVal = TrueC->getAPIntValue();
33937 const APInt &FalseVal = FalseC->getAPIntValue();
33939 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
33943 APInt AbsDiff = Diff.abs();
33944 if (AbsDiff.isPowerOf2() ||
33945 ((VT == MVT::i32 || VT == MVT::i64) &&
33946 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
33948 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
33949 // of the condition can usually be folded into a compare predicate, but even
33950 // without that, the sequence should be cheaper than a CMOV alternative.
33951 if (TrueVal.slt(FalseVal)) {
33952 Cond = DAG.getNOT(DL, Cond, MVT::i1);
33953 std::swap(TrueC, FalseC);
33956 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
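// Worked example (constants chosen for illustration): select i1 %c, 7, 3 has
// a difference of 4, so it becomes add (mul (zext %c), 4), 3 -- the
// power-of-2 multiply later lowers to a shift.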
33957 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
33959 // Multiply condition by the difference if non-one.
33960 if (!AbsDiff.isOneValue())
33961 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
33963 // Add the base if non-zero.
33964 if (!FalseC->isNullValue())
33965 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
33973 /// If this is a *dynamic* select (non-constant condition) and we can match
33974 /// this node with one of the variable blend instructions, restructure the
33975 /// condition so that blends can use the high (sign) bit of each element.
33976 /// This function will also call SimplifyDemandedBits on already created
33977 /// BLENDV to perform additional simplifications.
33978 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
33979 TargetLowering::DAGCombinerInfo &DCI,
33980 const X86Subtarget &Subtarget) {
33981 SDValue Cond = N->getOperand(0);
33982 if ((N->getOpcode() != ISD::VSELECT &&
33983 N->getOpcode() != X86ISD::BLENDV) ||
33984 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
33987 // Don't optimize before the condition has been transformed to a legal type
33988 // and don't ever optimize vector selects that map to AVX512 mask-registers.
33989 unsigned BitWidth = Cond.getScalarValueSizeInBits();
33990 if (BitWidth < 8 || BitWidth > 64)
33993 // We can only handle the cases where VSELECT is directly legal on the
33994 // subtarget. We custom lower VSELECT nodes with constant conditions and
33995 // this makes it hard to see whether a dynamic VSELECT will correctly
33996 // lower, so we both check the operation's status and explicitly handle the
33997 // cases where a *dynamic* blend will fail even though a constant-condition
33998 // blend could be custom lowered.
33999 // FIXME: We should find a better way to handle this class of problems.
34000 // Potentially, we should combine constant-condition vselect nodes
34001 // pre-legalization into shuffles and not mark as many types as custom
34003 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34004 EVT VT = N->getValueType(0);
34005 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
34007 // FIXME: We don't support i16-element blends currently. We could and
34008 // should support them by making *all* the bits in the condition be set
34009 // rather than just the high bit and using an i8-element blend.
34010 if (VT.getVectorElementType() == MVT::i16)
34012 // Dynamic blending was only available from SSE4.1 onward.
34013 if (VT.is128BitVector() && !Subtarget.hasSSE41())
34015 // Byte blends are only available in AVX2.
34016 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
34018 // There are no 512-bit blend instructions that use sign bits.
34019 if (VT.is512BitVector())
34022 // TODO: Add other opcodes eventually lowered into BLEND.
34023 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
34025 if ((UI->getOpcode() != ISD::VSELECT &&
34026 UI->getOpcode() != X86ISD::BLENDV) ||
34027 UI.getOperandNo() != 0)
34030 APInt DemandedMask(APInt::getSignMask(BitWidth));
34032 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
34033 !DCI.isBeforeLegalizeOps());
34034 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
34037 // If we changed the computation somewhere in the DAG, this change will
34038 // affect all users of Cond. Update all the nodes so that we do not use
34039 // the generic VSELECT anymore. Otherwise, we may perform wrong
34040 // optimizations as we messed with the actual expectation for the vector boolean values.
34042 for (SDNode *U : Cond->uses()) {
34043 if (U->getOpcode() == X86ISD::BLENDV)
34046 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
34047 Cond, U->getOperand(1), U->getOperand(2));
34048 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
34049 DCI.AddToWorklist(U);
34051 DCI.CommitTargetLoweringOpt(TLO);
34052 return SDValue(N, 0);
34055 /// Do target-specific dag combines on SELECT and VSELECT nodes.
34056 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
34057 TargetLowering::DAGCombinerInfo &DCI,
34058 const X86Subtarget &Subtarget) {
34060 SDValue Cond = N->getOperand(0);
34061 SDValue LHS = N->getOperand(1);
34062 SDValue RHS = N->getOperand(2);
34064 // Try simplification again because we use this function to optimize
34065 // BLENDV nodes that are not handled by the generic combiner.
34066 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
34069 EVT VT = LHS.getValueType();
34070 EVT CondVT = Cond.getValueType();
34071 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34073 // Convert vselects with constant condition into shuffles.
34074 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
34075 DCI.isBeforeLegalizeOps()) {
34076 SmallVector<int, 64> Mask;
34077 if (createShuffleMaskFromVSELECT(Mask, Cond))
34078 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
34081 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
34082 // instructions match the semantics of the common C idiom x<y?x:y but not
34083 // x<=y?x:y, because of how they handle negative zero (which can be
34084 // ignored in unsafe-math mode).
34085 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
34086 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
34087 VT != MVT::f80 && VT != MVT::f128 &&
34088 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
34089 (Subtarget.hasSSE2() ||
34090 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
34091 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
34093 unsigned Opcode = 0;
34094 // Check for x CC y ? x : y.
34095 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
34096 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
34100 // Converting this to a min would handle NaNs incorrectly, and swapping
34101 // the operands would cause it to handle comparisons between positive
34102 // and negative zero incorrectly.
34103 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
34104 if (!DAG.getTarget().Options.UnsafeFPMath &&
34105 !(DAG.isKnownNeverZeroFloat(LHS) ||
34106 DAG.isKnownNeverZeroFloat(RHS)))
34108 std::swap(LHS, RHS);
34110 Opcode = X86ISD::FMIN;
34113 // Converting this to a min would handle comparisons between positive
34114 // and negative zero incorrectly.
34115 if (!DAG.getTarget().Options.UnsafeFPMath &&
34116 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
34118 Opcode = X86ISD::FMIN;
34121 // Converting this to a min would handle both negative zeros and NaNs
34122 // incorrectly, but we can swap the operands to fix both.
34123 std::swap(LHS, RHS);
34128 Opcode = X86ISD::FMIN;
34132 // Converting this to a max would handle comparisons between positive
34133 // and negative zero incorrectly.
34134 if (!DAG.getTarget().Options.UnsafeFPMath &&
34135 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
34137 Opcode = X86ISD::FMAX;
34140 // Converting this to a max would handle NaNs incorrectly, and swapping
34141 // the operands would cause it to handle comparisons between positive
34142 // and negative zero incorrectly.
34143 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
34144 if (!DAG.getTarget().Options.UnsafeFPMath &&
34145 !(DAG.isKnownNeverZeroFloat(LHS) ||
34146 DAG.isKnownNeverZeroFloat(RHS)))
34148 std::swap(LHS, RHS);
34150 Opcode = X86ISD::FMAX;
34153 // Converting this to a max would handle both negative zeros and NaNs
34154 // incorrectly, but we can swap the operands to fix both.
34155 std::swap(LHS, RHS);
34160 Opcode = X86ISD::FMAX;
34163 // Check for x CC y ? y : x -- a min/max with reversed arms.
34164 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
34165 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
34169 // Converting this to a min would handle comparisons between positive
34170 // and negative zero incorrectly, and swapping the operands would
34171 // cause it to handle NaNs incorrectly.
34172 if (!DAG.getTarget().Options.UnsafeFPMath &&
34173 !(DAG.isKnownNeverZeroFloat(LHS) ||
34174 DAG.isKnownNeverZeroFloat(RHS))) {
34175 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
34177 std::swap(LHS, RHS);
34179 Opcode = X86ISD::FMIN;
34182 // Converting this to a min would handle NaNs incorrectly.
34183 if (!DAG.getTarget().Options.UnsafeFPMath &&
34184 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
34186 Opcode = X86ISD::FMIN;
34189 // Converting this to a min would handle both negative zeros and NaNs
34190 // incorrectly, but we can swap the operands to fix both.
34191 std::swap(LHS, RHS);
34196 Opcode = X86ISD::FMIN;
34200 // Converting this to a max would handle NaNs incorrectly.
34201 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
34203 Opcode = X86ISD::FMAX;
34206 // Converting this to a max would handle comparisons between positive
34207 // and negative zero incorrectly, and swapping the operands would
34208 // cause it to handle NaNs incorrectly.
34209 if (!DAG.getTarget().Options.UnsafeFPMath &&
34210 !DAG.isKnownNeverZeroFloat(LHS) &&
34211 !DAG.isKnownNeverZeroFloat(RHS)) {
34212 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
34214 std::swap(LHS, RHS);
34216 Opcode = X86ISD::FMAX;
34219 // Converting this to a max would handle both negative zeros and NaNs
34220 // incorrectly, but we can swap the operands to fix both.
34221 std::swap(LHS, RHS);
34226 Opcode = X86ISD::FMAX;
34232 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
34235 // Some mask scalar intrinsics rely on checking if only one bit is set
34236 // and implement it in C code like this:
34237 // A[0] = (U & 1) ? A[0] : W[0];
34238 // This creates some redundant instructions that break pattern matching.
34239 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
34240 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
34241 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
34242 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
34243 SDValue AndNode = Cond.getOperand(0);
34244 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
34245 isNullConstant(Cond.getOperand(1)) &&
34246 isOneConstant(AndNode.getOperand(1))) {
34247 // LHS and RHS swapped due to
34248 // setcc outputting 1 when AND resulted in 0 and vice versa.
34249 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
34250 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
34254 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
34255 // lowering on KNL. In this case we convert it to
34256 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
34257 // The same applies to all vectors of i8 and i16 without BWI.
34258 // Make sure we extend these even before type legalization gets a chance to
34259 // split wide vectors.
34260 // Since SKX these selects have a proper lowering.
34261 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
34262 CondVT.getVectorElementType() == MVT::i1 &&
34263 (ExperimentalVectorWideningLegalization ||
34264 VT.getVectorNumElements() > 4) &&
34265 (VT.getVectorElementType() == MVT::i8 ||
34266 VT.getVectorElementType() == MVT::i16)) {
34267 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
34268 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
34271 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
34274 // Canonicalize max and min:
34275 // (x > y) ? x : y -> (x >= y) ? x : y
34276 // (x < y) ? x : y -> (x <= y) ? x : y
34277 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
34278 // the need for an extra compare
34279 // against zero. e.g.
34280 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
34282 // testl %edi, %edi
34284 // cmovgl %edi, %eax
34288 // cmovsl %eax, %edi
34289 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
34290 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
34291 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
34292 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
34297 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
34298 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
34299 Cond.getOperand(0), Cond.getOperand(1), NewCC);
34300 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
34305 // Match VSELECTs into subs with unsigned saturation.
34306 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
34307 // psubus is available in SSE2 for i8 and i16 vectors.
34308 Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
34309 isPowerOf2_32(VT.getVectorNumElements()) &&
34310 (VT.getVectorElementType() == MVT::i8 ||
34311 VT.getVectorElementType() == MVT::i16)) {
34312 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
34314 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
34315 // left side invert the predicate to simplify logic below.
34317 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
34319 CC = ISD::getSetCCInverse(CC, true);
34320 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
34324 if (Other.getNode() && Other->getNumOperands() == 2 &&
34325 Other->getOperand(0) == Cond.getOperand(0)) {
34326 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
34327 SDValue CondRHS = Cond->getOperand(1);
34329 // Look for a general sub with unsigned saturation first.
34330 // x >= y ? x-y : 0 --> subus x, y
34331 // x > y ? x-y : 0 --> subus x, y
34332 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
34333 Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
34334 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
34336 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
34337 if (isa<BuildVectorSDNode>(CondRHS)) {
34338 // If the RHS is a constant we have to reverse the const
34339 // canonicalization.
34340 // x > C-1 ? x+-C : 0 --> subus x, C
34341 // TODO: Handle build_vectors with undef elements.
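// Worked example (C chosen for illustration): for C = 64 the input has
// already been canonicalized to x > 63 ? x + (-64) : 0, so the predicate
// below checks 63 == -(-64) - 1 and the add constant is negated back to +64
// as the subus operand.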
34342 auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
34343 return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
34345 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
34346 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) {
34347 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
34348 DAG.getConstant(0, DL, VT), OpRHS);
34349 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
34352 // Another special case: If C was a sign bit, the sub has been
34353 // canonicalized into a xor.
34354 // FIXME: Would it be better to use computeKnownBits to determine
34355 // whether it's safe to decanonicalize the xor?
34356 // x s< 0 ? x^C : 0 --> subus x, C
34357 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
34358 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
34359 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
34360 OpRHSConst->getAPIntValue().isSignMask()) {
34361 // Note that we have to rebuild the RHS constant here to ensure we
34362 // don't rely on particular values of undef lanes.
34363 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
34364 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
34372 // Match VSELECTs into add with unsigned saturation.
34373 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
34374 // paddus is available in SSE2 for i8 and i16 vectors.
34375 Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
34376 isPowerOf2_32(VT.getVectorNumElements()) &&
34377 (VT.getVectorElementType() == MVT::i8 ||
34378 VT.getVectorElementType() == MVT::i16)) {
34379 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
34381 SDValue CondLHS = Cond->getOperand(0);
34382 SDValue CondRHS = Cond->getOperand(1);
34384 // Check if one of the arms of the VSELECT is a vector with all bits set.
34385 // If it's on the left side invert the predicate to simplify logic below.
34387 if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
34389 CC = ISD::getSetCCInverse(CC, true);
34390 } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
34394 if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
34395 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
34397 // Canonicalize condition operands.
34398 if (CC == ISD::SETUGE) {
34399 std::swap(CondLHS, CondRHS);
34403 // We can test against either of the addition operands.
34404 // x <= x+y ? x+y : ~0 --> addus x, y
34405 // x+y >= x ? x+y : ~0 --> addus x, y
34406 if (CC == ISD::SETULE && Other == CondRHS &&
34407 (OpLHS == CondLHS || OpRHS == CondLHS))
34408 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
34410 if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
34411 CondLHS == OpLHS) {
34412 // If the RHS is a constant we have to reverse the const
34413 // canonicalization.
34414 // x > ~C ? x+C : ~0 --> addus x, C
34415 auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
34416 return Cond->getAPIntValue() == ~Op->getAPIntValue();
34418 if (CC == ISD::SETULE &&
34419 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
34420 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
34425 // Early exit check
34426 if (!TLI.isTypeLegal(VT))
34429 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
34432 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
34435 // Custom action for SELECT MMX
34436 if (VT == MVT::x86mmx) {
34437 LHS = DAG.getBitcast(MVT::i64, LHS);
34438 RHS = DAG.getBitcast(MVT::i64, RHS);
34439 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
34440 return DAG.getBitcast(VT, newSelect);
34447 /// Combine (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
34449 /// to (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
34450 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
34451 /// Note that this is only legal for some op/cc combinations.
34452 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
34454 const X86Subtarget &Subtarget) {
34455 // This combine only operates on CMP-like nodes.
34456 if (!(Cmp.getOpcode() == X86ISD::CMP ||
34457 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
34460 // Can't replace the cmp if it has more uses than the one we're looking at.
34461 // FIXME: We would like to be able to handle this, but would need to make sure
34462 // all uses were updated.
34463 if (!Cmp.hasOneUse())
34466 // This only applies to variations of the common case:
34467 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
34468 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
34469 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
34470 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
34471 // Using the proper condcodes (see below), overflow is checked for.
34473 // FIXME: We can generalize both constraints:
34474 // - XOR/OR/AND (if they were made to survive AtomicExpand)
34476 // if the result is compared.
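// As a concrete illustration of the first variation: for something like
//   if (atomic_fetch_add(&v, 1) < 0) ...
// the separate compare of the loaded value is dropped and the branch tests
// the EFLAGS of the LOCK ADD itself with COND_LE; the adjusted condition
// code is what keeps the overflowing case correct.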
34478 SDValue CmpLHS = Cmp.getOperand(0);
34479 SDValue CmpRHS = Cmp.getOperand(1);
34481 if (!CmpLHS.hasOneUse())
34484 unsigned Opc = CmpLHS.getOpcode();
34485 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
34488 SDValue OpRHS = CmpLHS.getOperand(2);
34489 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
34493 APInt Addend = OpRHSC->getAPIntValue();
34494 if (Opc == ISD::ATOMIC_LOAD_SUB)
34497 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
34501 APInt Comparison = CmpRHSC->getAPIntValue();
34503 // If the addend is the negation of the comparison value, then we can do
34504 // a full comparison by emitting the atomic arithmetic as a locked sub.
34505 if (Comparison == -Addend) {
34506 // The CC is fine, but we need to rewrite the LHS of the comparison as an atomic sub.
34508 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
34509 auto AtomicSub = DAG.getAtomic(
34510 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
34511 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
34512 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
34513 AN->getMemOperand());
34514 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
34515 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
34516 DAG.getUNDEF(CmpLHS.getValueType()));
34517 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
34521 // We can handle comparisons with zero in a number of cases by manipulating the CC used.
34523 if (!Comparison.isNullValue())
34526 if (CC == X86::COND_S && Addend == 1)
34528 else if (CC == X86::COND_NS && Addend == 1)
34530 else if (CC == X86::COND_G && Addend == -1)
34532 else if (CC == X86::COND_LE && Addend == -1)
34537 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
34538 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
34539 DAG.getUNDEF(CmpLHS.getValueType()));
34540 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
34544 // Check whether a boolean test is testing a boolean value generated by
34545 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition flag.
34548 // Simplify the following patterns:
34549 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
34550 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
34551 // to (Op EFLAGS Cond)
34553 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
34554 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
34555 // to (Op EFLAGS !Cond)
34557 // where Op could be BRCOND or CMOV.
34559 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
34560 // This combine only operates on CMP-like nodes.
34561 if (!(Cmp.getOpcode() == X86ISD::CMP ||
34562 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
34565 // Quit if not used as a boolean value.
34566 if (CC != X86::COND_E && CC != X86::COND_NE)
34569 // Check CMP operands. One of them should be 0 or 1 and the other should be
34570 // an SetCC or extended from it.
34571 SDValue Op1 = Cmp.getOperand(0);
34572 SDValue Op2 = Cmp.getOperand(1);
34575 const ConstantSDNode* C = nullptr;
34576 bool needOppositeCond = (CC == X86::COND_E);
34577 bool checkAgainstTrue = false; // Is it a comparison against 1?
34579 if ((C = dyn_cast<ConstantSDNode>(Op1)))
34581 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
34583 else // Quit if all operands are not constants.
34586 if (C->getZExtValue() == 1) {
34587 needOppositeCond = !needOppositeCond;
34588 checkAgainstTrue = true;
34589 } else if (C->getZExtValue() != 0)
34590 // Quit if the constant is neither 0 nor 1.
34593 bool truncatedToBoolWithAnd = false;
34594 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
34595 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
34596 SetCC.getOpcode() == ISD::TRUNCATE ||
34597 SetCC.getOpcode() == ISD::AND) {
34598 if (SetCC.getOpcode() == ISD::AND) {
34600 if (isOneConstant(SetCC.getOperand(0)))
34602 if (isOneConstant(SetCC.getOperand(1)))
34606 SetCC = SetCC.getOperand(OpIdx);
34607 truncatedToBoolWithAnd = true;
34609 SetCC = SetCC.getOperand(0);
34612 switch (SetCC.getOpcode()) {
34613 case X86ISD::SETCC_CARRY:
34614 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
34615 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
34616 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
34617 // truncated to i1 using 'and'.
34618 if (checkAgainstTrue && !truncatedToBoolWithAnd)
34620 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
34621 "Invalid use of SETCC_CARRY!");
34623 case X86ISD::SETCC:
34624 // Set the condition code or opposite one if necessary.
34625 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
34626 if (needOppositeCond)
34627 CC = X86::GetOppositeBranchCondition(CC);
34628 return SetCC.getOperand(1);
34629 case X86ISD::CMOV: {
34630 // Check whether the false/true values are canonical, i.e. 0 or 1.
34631 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
34632 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
34633 // Quit if true value is not a constant.
34636 // Quit if false value is not a constant.
34638 SDValue Op = SetCC.getOperand(0);
34639 // Skip 'zext' or 'trunc' node.
34640 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
34641 Op.getOpcode() == ISD::TRUNCATE)
34642 Op = Op.getOperand(0);
34643 // A special case for rdrand/rdseed, where 0 is set if the false cond is found.
34645 if ((Op.getOpcode() != X86ISD::RDRAND &&
34646 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
34649 // Quit if false value is not the constant 0 or 1.
34650 bool FValIsFalse = true;
34651 if (FVal && FVal->getZExtValue() != 0) {
34652 if (FVal->getZExtValue() != 1)
34654 // If FVal is 1, opposite cond is needed.
34655 needOppositeCond = !needOppositeCond;
34656 FValIsFalse = false;
34658 // Quit if TVal is not the constant opposite of FVal.
34659 if (FValIsFalse && TVal->getZExtValue() != 1)
34661 if (!FValIsFalse && TVal->getZExtValue() != 0)
34663 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
34664 if (needOppositeCond)
34665 CC = X86::GetOppositeBranchCondition(CC);
34666 return SetCC.getOperand(3);
34673 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
34675 /// (X86or (X86setcc) (X86setcc))
34676 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
34677 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
34678 X86::CondCode &CC1, SDValue &Flags,
34680 if (Cond->getOpcode() == X86ISD::CMP) {
34681 if (!isNullConstant(Cond->getOperand(1)))
34684 Cond = Cond->getOperand(0);
34689 SDValue SetCC0, SetCC1;
34690 switch (Cond->getOpcode()) {
34691 default: return false;
34698 SetCC0 = Cond->getOperand(0);
34699 SetCC1 = Cond->getOperand(1);
34703 // Make sure we have SETCC nodes, using the same flags value.
34704 if (SetCC0.getOpcode() != X86ISD::SETCC ||
34705 SetCC1.getOpcode() != X86ISD::SETCC ||
34706 SetCC0->getOperand(1) != SetCC1->getOperand(1))
34709 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
34710 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
34711 Flags = SetCC0->getOperand(1);
34715 // When legalizing carry, we create carries via add X, -1
34716 // If that comes from an actual carry, via setcc, we use the original EFLAGS directly.
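// Small worked example (values for illustration): if X is the 0/1 result of a
// setcc on COND_B, then (add X, -1) produces CF equal to X (0 + ~0 has no
// carry-out, 1 + ~0 does), so the ADD's flags can be replaced by the EFLAGS
// that fed the original setcc.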
34718 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
34719 if (EFLAGS.getOpcode() == X86ISD::ADD) {
34720 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
34721 SDValue Carry = EFLAGS.getOperand(0);
34722 while (Carry.getOpcode() == ISD::TRUNCATE ||
34723 Carry.getOpcode() == ISD::ZERO_EXTEND ||
34724 Carry.getOpcode() == ISD::SIGN_EXTEND ||
34725 Carry.getOpcode() == ISD::ANY_EXTEND ||
34726 (Carry.getOpcode() == ISD::AND &&
34727 isOneConstant(Carry.getOperand(1))))
34728 Carry = Carry.getOperand(0);
34729 if (Carry.getOpcode() == X86ISD::SETCC ||
34730 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
34731 if (Carry.getConstantOperandVal(0) == X86::COND_B)
34732 return Carry.getOperand(1);
34740 /// Optimize an EFLAGS definition used according to the condition code \p CC
34741 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
34742 /// uses of chain values.
34743 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
34745 const X86Subtarget &Subtarget) {
34746 if (CC == X86::COND_B)
34747 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
34750 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
34752 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
34755 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
34756 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
34757 TargetLowering::DAGCombinerInfo &DCI,
34758 const X86Subtarget &Subtarget) {
34761 SDValue FalseOp = N->getOperand(0);
34762 SDValue TrueOp = N->getOperand(1);
34763 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
34764 SDValue Cond = N->getOperand(3);
34766 // Try to simplify the EFLAGS and condition code operands.
34767 // We can't always do this as FCMOV only supports a subset of X86 cond.
34768 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
34769 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
34770 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
34772 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
34776 // If this is a select between two integer constants, try to do some
34777 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
34779 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
34780 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
34781 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
34782 // larger than FalseC (the false value).
34783 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
34784 CC = X86::GetOppositeBranchCondition(CC);
34785 std::swap(TrueC, FalseC);
34786 std::swap(TrueOp, FalseOp);
34789 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
34790 // This is efficient for any integer data type (including i8/i16) and any shift amount.
34792 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
34793 Cond = getSETCC(CC, Cond, DL, DAG);
34795 // Zero extend the condition if needed.
34796 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
34798 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
34799 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
34800 DAG.getConstant(ShAmt, DL, MVT::i8));
34804 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
34805 // for any integer data type, including i8/i16.
34806 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
34807 Cond = getSETCC(CC, Cond, DL, DAG);
34809 // Zero extend the condition if needed.
34810 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
34811 FalseC->getValueType(0), Cond);
34812 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
34813 SDValue(FalseC, 0));
34817 // Optimize cases that will turn into an LEA instruction. This requires
34818 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
34819 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
34820 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
34821 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
34823 bool isFastMultiplier = false;
34825 switch ((unsigned char)Diff) {
34827 case 1: // result = add base, cond
34828 case 2: // result = lea base( , cond*2)
34829 case 3: // result = lea base(cond, cond*2)
34830 case 4: // result = lea base( , cond*4)
34831 case 5: // result = lea base(cond, cond*4)
34832 case 8: // result = lea base( , cond*8)
34833 case 9: // result = lea base(cond, cond*8)
34834 isFastMultiplier = true;
34839 if (isFastMultiplier) {
34840 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
34841 Cond = getSETCC(CC, Cond, DL, DAG);
34842 // Zero extend the condition if needed.
34843 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
34845 // Scale the condition by the difference.
34847 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
34848 DAG.getConstant(Diff, DL, Cond.getValueType()));
34850 // Add the base if non-zero.
34851 if (FalseC->getAPIntValue() != 0)
34852 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
34853 SDValue(FalseC, 0));
34860 // Handle these cases:
34861 // (select (x != c), e, c) -> (select (x != c), e, x),
34862 // (select (x == c), c, e) -> (select (x == c), x, e)
34863 // where the c is an integer constant, and the "select" is the combination
34864 // of CMOV and CMP.
34866 // The rationale for this change is that the conditional-move from a constant
34867 // needs two instructions; however, a conditional-move from a register needs
34868 // only one instruction.
34870 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
34871 // some instruction-combining opportunities. This opt needs to be
34872 // postponed as late as possible.
34874 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
34875 // the DCI.xxxx conditions are provided to postpone the optimization as
34876 // late as possible.
34878 ConstantSDNode *CmpAgainst = nullptr;
34879 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
34880 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
34881 !isa<ConstantSDNode>(Cond.getOperand(0))) {
34883 if (CC == X86::COND_NE &&
34884 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
34885 CC = X86::GetOppositeBranchCondition(CC);
34886 std::swap(TrueOp, FalseOp);
34889 if (CC == X86::COND_E &&
34890 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
34891 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
34892 DAG.getConstant(CC, DL, MVT::i8), Cond };
34893 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
34898 // Fold and/or of setcc's to double CMOV:
34899 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
34900 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
34902 // This combine lets us generate:
34903 // cmovcc1 (jcc1 if we don't have CMOV)
34909 // cmovne (jne if we don't have CMOV)
34910 // When we can't use the CMOV instruction, it might increase branch misprediction.
34912 // When we can use CMOV, or when there is no mispredict, this improves
34913 // throughput and reduces register pressure.
34915 if (CC == X86::COND_NE) {
34917 X86::CondCode CC0, CC1;
34919 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
34921 std::swap(FalseOp, TrueOp);
34922 CC0 = X86::GetOppositeBranchCondition(CC0);
34923 CC1 = X86::GetOppositeBranchCondition(CC1);
34926 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
34928 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
34929 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
34930 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
34935 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
34936 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
34937 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
34938 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
34939 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
34940 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
34941 SDValue Add = TrueOp;
34942 SDValue Const = FalseOp;
34943 // Canonicalize the condition code for easier matching and output.
34944 if (CC == X86::COND_E)
34945 std::swap(Add, Const);
34947 // We might have replaced the constant in the cmov with the LHS of the
34948 // compare. If so change it to the RHS of the compare.
34949 if (Const == Cond.getOperand(0))
34950 Const = Cond.getOperand(1);
34952 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
34953 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
34954 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
34955 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
34956 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
34957 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
34958 EVT VT = N->getValueType(0);
34959 // This should constant fold.
34960 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
34961 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
34962 DAG.getConstant(X86::COND_NE, DL, MVT::i8),
34964 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
34971 /// Different mul shrinking modes.
34972 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
34974 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
34975 EVT VT = N->getOperand(0).getValueType();
34976 if (VT.getScalarSizeInBits() != 32)
34979 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
34980 unsigned SignBits[2] = {1, 1};
34981 bool IsPositive[2] = {false, false};
34982 for (unsigned i = 0; i < 2; i++) {
34983 SDValue Opd = N->getOperand(i);
34985 SignBits[i] = DAG.ComputeNumSignBits(Opd);
34986 IsPositive[i] = DAG.SignBitIsZero(Opd);
34989 bool AllPositive = IsPositive[0] && IsPositive[1];
34990 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
34991 // When ranges are from -128 ~ 127, use MULS8 mode.
34992 if (MinSignBits >= 25)
34994 // When ranges are from 0 ~ 255, use MULU8 mode.
34995 else if (AllPositive && MinSignBits >= 24)
34997 // When ranges are from -32768 ~ 32767, use MULS16 mode.
34998 else if (MinSignBits >= 17)
35000 // When ranges are from 0 ~ 65535, use MULU16 mode.
35001 else if (AllPositive && MinSignBits >= 16)
35008 /// When the operands of vector mul are extended from smaller size values,
35009 /// like i8 and i16, the type of mul may be shrunk to generate more
35010 /// efficient code. Two typical patterns are handled:
35012 /// %2 = sext/zext <N x i8> %1 to <N x i32>
35013 /// %4 = sext/zext <N x i8> %3 to <N x i32>
35014 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
35015 /// %5 = mul <N x i32> %2, %4
35018 /// %2 = zext/sext <N x i16> %1 to <N x i32>
35019 /// %4 = zext/sext <N x i16> %3 to <N x i32>
35020 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
35021 /// %5 = mul <N x i32> %2, %4
35023 /// There are four mul shrinking modes:
35024 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
35025 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
35026 /// generate pmullw+sext32 for it (MULS8 mode).
35027 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
35028 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
35029 /// generate pmullw+zext32 for it (MULU8 mode).
35030 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
35031 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
35032 /// generate pmullw+pmulhw for it (MULS16 mode).
35033 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
35034 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
35035 /// generate pmullw+pmulhuw for it (MULU16 mode).
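/// For illustration (types picked as an example): with
///   %a = sext <8 x i8> %x to <8 x i32>
///   %b = sext <8 x i8> %y to <8 x i32>
///   %m = mul <8 x i32> %a, %b
/// every product fits in 16 bits, so MULS8 applies: truncate the operands to
/// <8 x i16>, multiply with pmullw, and sign-extend the result back to i32.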
35036 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
35037 const X86Subtarget &Subtarget) {
35038 // Check for legality
35039 // pmullw/pmulhw are not available before SSE2.
35040 if (!Subtarget.hasSSE2())
35043 // Check for profitability
35044 // pmulld is supported since SSE41. It is better to use pmulld
35045 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than pmullw+pmulhw.
35047 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
35048 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
35052 if (!canReduceVMulWidth(N, DAG, Mode))
35056 SDValue N0 = N->getOperand(0);
35057 SDValue N1 = N->getOperand(1);
35058 EVT VT = N->getOperand(0).getValueType();
35059 unsigned NumElts = VT.getVectorNumElements();
35060 if ((NumElts % 2) != 0)
35063 unsigned RegSize = 128;
35064 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
35065 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
35067 // Shrink the operands of mul.
35068 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
35069 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
35071 if (ExperimentalVectorWideningLegalization ||
35072 NumElts >= OpsVT.getVectorNumElements()) {
35073 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
35074 // lower part is needed.
35075 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
35076 if (Mode == MULU8 || Mode == MULS8)
35077 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
35080 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
35081 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
35082 // the higher part is also needed.
35083 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
35084 ReducedVT, NewN0, NewN1);
35086 // Repack the lower part and higher part result of mul into a wider result.
35088 // Generate shuffle functioning as punpcklwd.
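// For example (NumElts = 8, purely illustrative): the mask built below is
// {0, 8, 1, 9, 2, 10, 3, 11}, i.e. the low words of MulLo interleaved with
// the matching words of MulHi, which is exactly what punpcklwd produces.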
35089 SmallVector<int, 16> ShuffleMask(NumElts);
35090 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
35091 ShuffleMask[2 * i] = i;
35092 ShuffleMask[2 * i + 1] = i + NumElts;
35095 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
35096 ResLo = DAG.getBitcast(ResVT, ResLo);
35097 // Generate shuffle functioning as punpckhwd.
35098 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
35099 ShuffleMask[2 * i] = i + NumElts / 2;
35100 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
35103 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
35104 ResHi = DAG.getBitcast(ResVT, ResHi);
35105 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
35108 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
35109 // to legalize the mul explicitly because implicit legalization for type
35110 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
35111 // instructions which will not exist when we explicitly legalize it by
35112 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
35113 // <4 x i16> undef).
35115 // Legalize the operands of mul.
35116 // FIXME: We may be able to handle non-concatenated vectors by insertion.
35117 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
35118 if ((RegSize % ReducedSizeInBits) != 0)
35121 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
35122 DAG.getUNDEF(ReducedVT));
35124 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
35126 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
35128 if (Mode == MULU8 || Mode == MULS8) {
35129 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower part is needed.
35131 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
35133 // convert the type of mul result to VT.
35134 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
35135 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
35136 : ISD::SIGN_EXTEND_VECTOR_INREG,
35138 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
35139 DAG.getIntPtrConstant(0, DL));
35142 // Generate the lower and higher parts of mul: pmullw and pmulhw/pmulhuw. For
35143 // MULU16/MULS16, both parts are needed.
35144 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
35145 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
35146 OpsVT, NewN0, NewN1);
35148 // Repack the lower part and higher part result of mul into a wider
35149 // result. Make sure the type of mul result is VT.
35150 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
35151 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
35152 Res = DAG.getBitcast(ResVT, Res);
35153 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
35154 DAG.getIntPtrConstant(0, DL));
35157 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
35158 EVT VT, const SDLoc &DL) {
35160 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
35161 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
35162 DAG.getConstant(Mult, DL, VT));
35163 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
35164 DAG.getConstant(Shift, DL, MVT::i8));
35165 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
35170 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
35171 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
35172 DAG.getConstant(Mul1, DL, VT));
35173 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
35174 DAG.getConstant(Mul2, DL, VT));
35175 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
35184 // mul x, 11 => add ((shl (mul x, 5), 1), x)
35185 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
35187 // mul x, 21 => add ((shl (mul x, 5), 2), x)
35188 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
35190 // mul x, 41 => add ((shl (mul x, 5), 3), x)
35191 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
35193 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
35194 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
35195 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
35197 // mul x, 19 => add ((shl (mul x, 9), 1), x)
35198 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
35200 // mul x, 37 => add ((shl (mul x, 9), 2), x)
35201 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
35203 // mul x, 73 => add ((shl (mul x, 9), 3), x)
35204 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
35206 // mul x, 13 => add ((shl (mul x, 3), 2), x)
35207 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
35209 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
35210 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
35212 // mul x, 26 => add ((mul (mul x, 5), 5), x)
35213 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
35215 // mul x, 28 => add ((mul (mul x, 9), 3), x)
35216 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
35218 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
35219 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
35220 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
35223 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
35224 // followed by a single LEA.
35225 // First check if this is a sum of two powers of 2 because that's easy. Then
35226 // count the trailing zeros up to the first set bit.
35227 // TODO: We can do this even without LEA at a cost of two shifts and an add.
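// For example (hypothetical register choice), mul x, 34 = (x << 5) + (x << 1),
// and the shift-by-1 term folds into the LEA scale, giving roughly:
//   movq %rdi, %rax ; shlq $5, %rax ; leaq (%rax,%rdi,2), %rax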
35228 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
35229 unsigned ScaleShift = countTrailingZeros(MulAmt);
35230 if (ScaleShift >= 1 && ScaleShift < 4) {
35231 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
35232 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
35233 DAG.getConstant(ShiftAmt, DL, MVT::i8));
35234 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
35235 DAG.getConstant(ScaleShift, DL, MVT::i8));
35236 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
35243 // If the upper 17 bits of each element are zero then we can use PMADDWD,
35244 // which is always at least as quick as PMULLD, except on KNL.
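// The 17-bit (rather than 16-bit) requirement is because PMADDWD treats each
// 16-bit lane as signed: with the upper 17 bits known zero, every lane holds a
// non-negative 15-bit value, so the signed 16x16 multiply is exact and the
// horizontal add only adds a zero term from the upper lane.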
35245 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
35246 const X86Subtarget &Subtarget) {
35247 if (!Subtarget.hasSSE2())
35250 if (Subtarget.isPMADDWDSlow())
35253 EVT VT = N->getValueType(0);
35255 // Only support vXi32 vectors.
35256 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
35259 // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
35260 // Also allow v2i32 if it will be widened.
35261 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
35262 if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) ||
35263 DAG.getTargetLoweringInfo().isTypeLegal(WVT)))
35266 SDValue N0 = N->getOperand(0);
35267 SDValue N1 = N->getOperand(1);
35269 // If we are zero extending two steps without SSE4.1, it's better to reduce
35270 // the vmul width instead.
35271 if (!Subtarget.hasSSE41() &&
35272 (N0.getOpcode() == ISD::ZERO_EXTEND &&
35273 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
35274 (N1.getOpcode() == ISD::ZERO_EXTEND &&
35275 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
35278 APInt Mask17 = APInt::getHighBitsSet(32, 17);
35279 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
35280 !DAG.MaskedValueIsZero(N0, Mask17))
35283 // Use SplitOpsAndApply to handle AVX splitting.
35284 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
35285 ArrayRef<SDValue> Ops) {
35286 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
35287 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
35289 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
35290 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
35294 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
35295 const X86Subtarget &Subtarget) {
35296 if (!Subtarget.hasSSE2())
35299 EVT VT = N->getValueType(0);
35301 // Only support vXi64 vectors.
35302 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
35303 VT.getVectorNumElements() < 2 ||
35304 !isPowerOf2_32(VT.getVectorNumElements()))
35307 SDValue N0 = N->getOperand(0);
35308 SDValue N1 = N->getOperand(1);
35310 // MULDQ returns the 64-bit result of the signed multiplication of the lower
35311 // 32-bits. We can lower with this if the sign bits stretch that far.
35312 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
35313 DAG.ComputeNumSignBits(N1) > 32) {
35314 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
35315 ArrayRef<SDValue> Ops) {
35316 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
35318 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
35319 PMULDQBuilder, /*CheckBWI*/false);
35322 // If the upper bits are zero we can use a single pmuludq.
35323 APInt Mask = APInt::getHighBitsSet(64, 32);
35324 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
35325 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
35326 ArrayRef<SDValue> Ops) {
35327 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
35329 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
35330 PMULUDQBuilder, /*CheckBWI*/false);
35336 /// Optimize a single multiply with constant into two operations in order to
35337 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
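/// For example (illustrative register choice), mul x, 45 can be emitted as two
/// LEAs:
///   leal (%rdi,%rdi,8), %eax   ; x*9
///   leal (%rax,%rax,4), %eax   ; (x*9)*5 == x*45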
35338 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
35339 TargetLowering::DAGCombinerInfo &DCI,
35340 const X86Subtarget &Subtarget) {
35341 EVT VT = N->getValueType(0);
35343 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
35346 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
35349 if (DCI.isBeforeLegalize() && VT.isVector())
35350 return reduceVMULWidth(N, DAG, Subtarget);
35352 if (!MulConstantOptimization)
35354 // An imul is usually smaller than the alternative sequence.
35355 if (DAG.getMachineFunction().getFunction().optForMinSize())
35358 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
35361 if (VT != MVT::i64 && VT != MVT::i32)
35364 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
35367 if (isPowerOf2_64(C->getZExtValue()))
35370 int64_t SignMulAmt = C->getSExtValue();
35371 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
35372 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
35375 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
35376 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
35377 DAG.getConstant(AbsMulAmt, DL, VT));
35378 if (SignMulAmt < 0)
35379 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
35385 uint64_t MulAmt1 = 0;
35386 uint64_t MulAmt2 = 0;
35387 if ((AbsMulAmt % 9) == 0) {
35389 MulAmt2 = AbsMulAmt / 9;
35390 } else if ((AbsMulAmt % 5) == 0) {
35392 MulAmt2 = AbsMulAmt / 5;
35393 } else if ((AbsMulAmt % 3) == 0) {
35395 MulAmt2 = AbsMulAmt / 3;
35399 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
35401 (isPowerOf2_64(MulAmt2) ||
35402 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
35404 if (isPowerOf2_64(MulAmt2) &&
35405 !(SignMulAmt >= 0 && N->hasOneUse() &&
35406 N->use_begin()->getOpcode() == ISD::ADD))
35407 // If the second multiplier is a power of 2, issue it first. We want the multiply by
35408 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
35409 // is an add. Only do this for positive multiply amounts since the
35410 // negate would prevent it from being used as an address mode anyway.
35411 std::swap(MulAmt1, MulAmt2);
35413 if (isPowerOf2_64(MulAmt1))
35414 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
35415 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
35417 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
35418 DAG.getConstant(MulAmt1, DL, VT));
35420 if (isPowerOf2_64(MulAmt2))
35421 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
35422 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
35424 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
35425 DAG.getConstant(MulAmt2, DL, VT));
35427 // Negate the result.
35428 if (SignMulAmt < 0)
35429 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
35431 } else if (!Subtarget.slowLEA())
35432 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
35435 assert(C->getZExtValue() != 0 &&
35436 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
35437 "Both cases that could cause potential overflows should have "
35438 "already been handled.");
35439 if (isPowerOf2_64(AbsMulAmt - 1)) {
35440 // (mul x, 2^N + 1) => (add (shl x, N), x)
35441 NewMul = DAG.getNode(
35442 ISD::ADD, DL, VT, N->getOperand(0),
35443 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
35444 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
35446 // To negate, subtract the number from zero
35447 if (SignMulAmt < 0)
35448 NewMul = DAG.getNode(ISD::SUB, DL, VT,
35449 DAG.getConstant(0, DL, VT), NewMul);
35450 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
35451 // (mul x, 2^N - 1) => (sub (shl x, N), x)
35452 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
35453 DAG.getConstant(Log2_64(AbsMulAmt + 1),
35455 // To negate, reverse the operands of the subtract.
35456 if (SignMulAmt < 0)
35457 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
35459 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
35460 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
35461 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
35462 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
35463 DAG.getConstant(Log2_64(AbsMulAmt - 2),
35465 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
35466 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
35467 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
35468 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
35469 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
35470 DAG.getConstant(Log2_64(AbsMulAmt + 2),
35472 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
35473 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
35480 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
35481 SDValue N0 = N->getOperand(0);
35482 SDValue N1 = N->getOperand(1);
35483 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
35484 EVT VT = N0.getValueType();
35486 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
35487 // since the result of setcc_c is all zero's or all ones.
35488 if (VT.isInteger() && !VT.isVector() &&
35489 N1C && N0.getOpcode() == ISD::AND &&
35490 N0.getOperand(1).getOpcode() == ISD::Constant) {
35491 SDValue N00 = N0.getOperand(0);
35492 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
35493 Mask <<= N1C->getAPIntValue();
35494 bool MaskOK = false;
35495 // We can handle cases concerning bit-widening nodes containing setcc_c if
35496 // we carefully interrogate the mask to make sure we are semantics preserving.
35498 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
35499 // of the underlying setcc_c operation if the setcc_c was zero extended.
35500 // Consider the following example:
35501 // zext(setcc_c) -> i32 0x0000FFFF
35502 // c1 -> i32 0x0000FFFF
35503 // c2 -> i32 0x00000001
35504 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
35505 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
35506 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
35508 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
35509 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
35511 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
35512 N00.getOpcode() == ISD::ANY_EXTEND) &&
35513 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
35514 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
35516 if (MaskOK && Mask != 0) {
35518 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
35522 // Hardware support for vector shifts is sparse which makes us scalarize the
35523 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
// SHL by 1:
35525 // (shl V, 1) -> add V,V
35526 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
35527 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
35528 assert(N0.getValueType().isVector() && "Invalid vector shift type");
35529 // We shift all of the values by one. In many cases we do not have
35530 // hardware support for this operation. This is better expressed as an ADD
// of the value with itself.
35532 if (N1SplatC->getAPIntValue() == 1)
35533 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
35539 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
35540 SDValue N0 = N->getOperand(0);
35541 SDValue N1 = N->getOperand(1);
35542 EVT VT = N0.getValueType();
35543 unsigned Size = VT.getSizeInBits();
35545 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
35546 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
35547 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
35548 // depending on sign of (SarConst - [56,48,32,24,16])
35550 // sexts on X86 are MOVs. The MOVs have the same code size
35551 // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
35552 // However, the MOVs have 2 advantages over a SHIFT:
35553 // 1. MOVs can write to a register that differs from the source.
35554 // 2. MOVs accept memory operands.
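// For example (illustrative, i64): (sra (shl x, 56), 58) becomes
// (sra (sext_inreg x, i8), 2), i.e. roughly a movsbq followed by a shift by 2
// instead of two full-width shifts.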
35556 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
35557 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
35558 N0.getOperand(1).getOpcode() != ISD::Constant)
35561 SDValue N00 = N0.getOperand(0);
35562 SDValue N01 = N0.getOperand(1);
35563 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
35564 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
35565 EVT CVT = N1.getValueType();
35567 if (SarConst.isNegative())
35570 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
35571 unsigned ShiftSize = SVT.getSizeInBits();
35572 // Skip types without a corresponding sext/zext and ShlConst values
35573 // that are not one of [56,48,32,24,16].
35574 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
35578 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
35579 SarConst = SarConst - (Size - ShiftSize);
35582 else if (SarConst.isNegative())
35583 return DAG.getNode(ISD::SHL, DL, VT, NN,
35584 DAG.getConstant(-SarConst, DL, CVT));
35586 return DAG.getNode(ISD::SRA, DL, VT, NN,
35587 DAG.getConstant(SarConst, DL, CVT));
35592 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
35593 TargetLowering::DAGCombinerInfo &DCI) {
35594 SDValue N0 = N->getOperand(0);
35595 SDValue N1 = N->getOperand(1);
35596 EVT VT = N0.getValueType();
35598 // Only do this on the last DAG combine as it can interfere with other
35600 if (!DCI.isAfterLegalizeDAG())
35603 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
35604 // TODO: This is a generic DAG combine that became an x86-only combine to
35605 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
35606 // and-not ('andn').
35607 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
35610 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
35611 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
35612 if (!ShiftC || !AndC)
35615 // If we can shrink the constant mask below 8-bits or 32-bits, then this
35616 // transform should reduce code size. It may also enable secondary transforms
35617 // from improved known-bits analysis or instruction selection.
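// For example (illustrative): srl (and X, 0x7F00), 8 --> and (srl X, 8), 0x7F,
// shrinking the AND immediate from a 32-bit encoding to an 8-bit encoding.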
35618 APInt MaskVal = AndC->getAPIntValue();
35620 // If this can be matched by a zero extend, don't optimize.
35621 if (MaskVal.isMask()) {
35622 unsigned TO = MaskVal.countTrailingOnes();
35623 if (TO >= 8 && isPowerOf2_32(TO))
35627 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
35628 unsigned OldMaskSize = MaskVal.getMinSignedBits();
35629 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
35630 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
35631 (OldMaskSize > 32 && NewMaskSize <= 32)) {
35632 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
35634 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
35635 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
35636 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
35641 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
35642 TargetLowering::DAGCombinerInfo &DCI,
35643 const X86Subtarget &Subtarget) {
35644 if (N->getOpcode() == ISD::SHL)
35645 if (SDValue V = combineShiftLeft(N, DAG))
35648 if (N->getOpcode() == ISD::SRA)
35649 if (SDValue V = combineShiftRightArithmetic(N, DAG))
35652 if (N->getOpcode() == ISD::SRL)
35653 if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
35659 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
35660 TargetLowering::DAGCombinerInfo &DCI,
35661 const X86Subtarget &Subtarget) {
35662 unsigned Opcode = N->getOpcode();
35663 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
35664 "Unexpected pack opcode");
35666 EVT VT = N->getValueType(0);
35667 SDValue N0 = N->getOperand(0);
35668 SDValue N1 = N->getOperand(1);
35669 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
35670 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
35671 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
35672 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
35673 "Unexpected PACKSS/PACKUS input type");
35675 bool IsSigned = (X86ISD::PACKSS == Opcode);
35677 // Constant Folding.
35678 APInt UndefElts0, UndefElts1;
35679 SmallVector<APInt, 32> EltBits0, EltBits1;
35680 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
35681 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
35682 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
35683 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
35684 unsigned NumLanes = VT.getSizeInBits() / 128;
35685 unsigned NumDstElts = VT.getVectorNumElements();
35686 unsigned NumSrcElts = NumDstElts / 2;
35687 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
35688 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
35690 APInt Undefs(NumDstElts, 0);
35691 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
35692 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
35693 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
35694 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
35695 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
35696 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
35698 if (UndefElts[SrcIdx]) {
35699 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
35703 APInt &Val = EltBits[SrcIdx];
35705 // PACKSS: Truncate signed value with signed saturation.
35706 // Source values less than dst minint are saturated to minint.
35707 // Source values greater than dst maxint are saturated to maxint.
35708 if (Val.isSignedIntN(DstBitsPerElt))
35709 Val = Val.trunc(DstBitsPerElt);
35710 else if (Val.isNegative())
35711 Val = APInt::getSignedMinValue(DstBitsPerElt);
35713 Val = APInt::getSignedMaxValue(DstBitsPerElt);
35715 // PACKUS: Truncate signed value with unsigned saturation.
35716 // Source values less than zero are saturated to zero.
35717 // Source values greater than dst maxuint are saturated to maxuint.
35718 if (Val.isIntN(DstBitsPerElt))
35719 Val = Val.trunc(DstBitsPerElt);
35720 else if (Val.isNegative())
35721 Val = APInt::getNullValue(DstBitsPerElt);
35723 Val = APInt::getAllOnesValue(DstBitsPerElt);
35725 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
35729 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
35732 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
35733 // truncate to create a larger truncate.
35734 if (Subtarget.hasAVX512() &&
35735 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
35736 N0.getOperand(0).getValueType() == MVT::v8i32) {
35737 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
35739 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
35740 if (Subtarget.hasVLX())
35741 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
35743 // Widen input to v16i32 so we can truncate that.
35745 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
35746 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
35747 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
35751 // Attempt to combine as shuffle.
35754 combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
35755 /*HasVarMask*/ false,
35756 /*AllowVarMask*/ true, DAG, Subtarget))
35762 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
35763 TargetLowering::DAGCombinerInfo &DCI,
35764 const X86Subtarget &Subtarget) {
35765 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
35766 X86ISD::VSRL == N->getOpcode()) &&
35767 "Unexpected shift opcode");
35768 EVT VT = N->getValueType(0);
35770 // Shift zero -> zero.
35771 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
35772 return DAG.getConstant(0, SDLoc(N), VT);
35774 APInt KnownUndef, KnownZero;
35775 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35776 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
35777 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
35779 return SDValue(N, 0);
35784 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
35785 TargetLowering::DAGCombinerInfo &DCI,
35786 const X86Subtarget &Subtarget) {
35787 unsigned Opcode = N->getOpcode();
35788 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
35789 X86ISD::VSRLI == Opcode) &&
35790 "Unexpected shift opcode");
35791 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
35792 EVT VT = N->getValueType(0);
35793 SDValue N0 = N->getOperand(0);
35794 SDValue N1 = N->getOperand(1);
35795 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
35796 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
35797 "Unexpected value type");
35798 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
35800 // Out of range logical bit shifts are guaranteed to be zero.
35801 // Out of range arithmetic bit shifts splat the sign bit.
35802 unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue();
35803 if (ShiftVal >= NumBitsPerElt) {
35805 return DAG.getConstant(0, SDLoc(N), VT);
35807 ShiftVal = NumBitsPerElt - 1;
35810 // Shift N0 by zero -> N0.
35814 // Shift zero -> zero.
35815 if (ISD::isBuildVectorAllZeros(N0.getNode()))
35816 return DAG.getConstant(0, SDLoc(N), VT);
35818 // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
35819 // clamped to (NumBitsPerElt - 1).
35820 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
35821 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
35822 unsigned NewShiftVal = ShiftVal + ShiftVal2;
35823 if (NewShiftVal >= NumBitsPerElt)
35824 NewShiftVal = NumBitsPerElt - 1;
35825 return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
35826 DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8));
35829 // We can decode 'whole byte' logical bit shifts as shuffles.
35830 if (LogicalShift && (ShiftVal % 8) == 0) {
35832 if (SDValue Res = combineX86ShufflesRecursively(
35833 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
35834 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
35838 // Constant Folding.
35840 SmallVector<APInt, 32> EltBits;
35841 if (N->isOnlyUserOf(N0.getNode()) &&
35842 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
35843 assert(EltBits.size() == VT.getVectorNumElements() &&
35844 "Unexpected shift value type");
35845 for (APInt &Elt : EltBits) {
35846 if (X86ISD::VSHLI == Opcode)
35848 else if (X86ISD::VSRAI == Opcode)
35849 Elt.ashrInPlace(ShiftVal);
35851 Elt.lshrInPlace(ShiftVal);
35853 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
35856 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35857 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
35858 APInt::getAllOnesValue(NumBitsPerElt), DCI))
35859 return SDValue(N, 0);
35864 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
35865 TargetLowering::DAGCombinerInfo &DCI,
35866 const X86Subtarget &Subtarget) {
35868 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
35869 (N->getOpcode() == X86ISD::PINSRW &&
35870 N->getValueType(0) == MVT::v8i16)) &&
35871 "Unexpected vector insertion");
35873 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
35876 combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
35877 /*HasVarMask*/ false,
35878 /*AllowVarMask*/ true, DAG, Subtarget))
35884 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
35885 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
35886 /// OR -> CMPNEQSS.
35887 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
35888 TargetLowering::DAGCombinerInfo &DCI,
35889 const X86Subtarget &Subtarget) {
35892 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
35893 // we're requiring SSE2 for both.
35894 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
35895 SDValue N0 = N->getOperand(0);
35896 SDValue N1 = N->getOperand(1);
35897 SDValue CMP0 = N0->getOperand(1);
35898 SDValue CMP1 = N1->getOperand(1);
35901 // The SETCCs should both refer to the same CMP.
35902 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
35905 SDValue CMP00 = CMP0->getOperand(0);
35906 SDValue CMP01 = CMP0->getOperand(1);
35907 EVT VT = CMP00.getValueType();
35909 if (VT == MVT::f32 || VT == MVT::f64) {
35910 bool ExpectingFlags = false;
35911 // Check for any users that want flags:
35912 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
35913 !ExpectingFlags && UI != UE; ++UI)
35914 switch (UI->getOpcode()) {
35919 ExpectingFlags = true;
35921 case ISD::CopyToReg:
35922 case ISD::SIGN_EXTEND:
35923 case ISD::ZERO_EXTEND:
35924 case ISD::ANY_EXTEND:
35928 if (!ExpectingFlags) {
35929 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
35930 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
35932 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
35933 X86::CondCode tmp = cc0;
35938 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
35939 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
35940 // FIXME: need symbolic constants for these magic numbers.
35941 // See X86ATTInstPrinter.cpp:printSSECC().
35942 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
35943 if (Subtarget.hasAVX512()) {
35945 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
35946 DAG.getConstant(x86cc, DL, MVT::i8));
35947 // Need to fill with zeros to ensure the bitcast will produce zeroes
35948 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
35949 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
35950 DAG.getConstant(0, DL, MVT::v16i1),
35951 FSetCC, DAG.getIntPtrConstant(0, DL));
35952 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
35953 N->getSimpleValueType(0));
35955 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
35956 CMP00.getValueType(), CMP00, CMP01,
35957 DAG.getConstant(x86cc, DL,
35960 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
35961 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
35963 if (is64BitFP && !Subtarget.is64Bit()) {
35964 // On a 32-bit target, we cannot bitcast the 64-bit float to a
35965 // 64-bit integer, since that's not a legal type. Since
35966 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
35967 // bits, but can do this little dance to extract the lowest 32 bits
35968 // and work with those going forward.
35969 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
35971 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
35972 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
35973 Vector32, DAG.getIntPtrConstant(0, DL));
35977 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
35978 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
35979 DAG.getConstant(1, DL, IntVT));
35980 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
35982 return OneBitOfTruth;
35990 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
35991 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
35992 assert(N->getOpcode() == ISD::AND);
35994 MVT VT = N->getSimpleValueType(0);
35995 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
35999 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
36000 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
36001 if (N0.getOpcode() == ISD::XOR &&
36002 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
36003 X = N0.getOperand(0);
36005 } else if (N1.getOpcode() == ISD::XOR &&
36006 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
36007 X = N1.getOperand(0);
36012 X = DAG.getBitcast(VT, X);
36013 Y = DAG.getBitcast(VT, Y);
36014 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
36017 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
36018 // register. In most cases we actually compare or select YMM-sized registers
36019 // and mixing the two types creates horrible code. This method optimizes
36020 // some of the transition sequences.
36021 // Even with AVX-512 this is still useful for removing casts around logical
36022 // operations on vXi1 mask types.
36023 static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
36024 const X86Subtarget &Subtarget) {
36025 EVT VT = N->getValueType(0);
36026 assert(VT.isVector() && "Expected vector type");
36028 assert((N->getOpcode() == ISD::ANY_EXTEND ||
36029 N->getOpcode() == ISD::ZERO_EXTEND ||
36030 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
36032 SDValue Narrow = N->getOperand(0);
36033 EVT NarrowVT = Narrow.getValueType();
36035 if (Narrow->getOpcode() != ISD::XOR &&
36036 Narrow->getOpcode() != ISD::AND &&
36037 Narrow->getOpcode() != ISD::OR)
36040 SDValue N0 = Narrow->getOperand(0);
36041 SDValue N1 = Narrow->getOperand(1);
36044 // The Left side has to be a trunc.
36045 if (N0.getOpcode() != ISD::TRUNCATE)
36048 // The type of the truncated inputs.
36049 if (N0->getOperand(0).getValueType() != VT)
36052 // The right side has to be a 'trunc' or a constant vector.
36053 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
36054 N1.getOperand(0).getValueType() == VT;
36056 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
36059 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36061 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
36064 // Set N0 and N1 to hold the inputs to the new wide operation.
36065 N0 = N0->getOperand(0);
36067 N1 = N1->getOperand(0);
36069 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
36071 // Generate the wide operation.
36072 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
36073 unsigned Opcode = N->getOpcode();
36075 default: llvm_unreachable("Unexpected opcode");
36076 case ISD::ANY_EXTEND:
36078 case ISD::ZERO_EXTEND:
36079 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
36080 case ISD::SIGN_EXTEND:
36081 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
36082 Op, DAG.getValueType(NarrowVT));
36086 /// If both input operands of a logic op are being cast from floating point
36087 /// types, try to convert this into a floating point logic node to avoid
36088 /// unnecessary moves from SSE to integer registers.
36089 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
36090 const X86Subtarget &Subtarget) {
36091 unsigned FPOpcode = ISD::DELETED_NODE;
36092 if (N->getOpcode() == ISD::AND)
36093 FPOpcode = X86ISD::FAND;
36094 else if (N->getOpcode() == ISD::OR)
36095 FPOpcode = X86ISD::FOR;
36096 else if (N->getOpcode() == ISD::XOR)
36097 FPOpcode = X86ISD::FXOR;
36099 assert(FPOpcode != ISD::DELETED_NODE &&
36100 "Unexpected input node for FP logic conversion");
36102 EVT VT = N->getValueType(0);
36103 SDValue N0 = N->getOperand(0);
36104 SDValue N1 = N->getOperand(1);
36106 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
36107 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
36108 (Subtarget.hasSSE2() && VT == MVT::i64))) {
36109 SDValue N00 = N0.getOperand(0);
36110 SDValue N10 = N1.getOperand(0);
36111 EVT N00Type = N00.getValueType();
36112 EVT N10Type = N10.getValueType();
36113 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
36114 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
36115 return DAG.getBitcast(VT, FPLogic);
36121 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
36122 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
36123 /// with a shift-right to eliminate loading the vector constant mask value.
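/// For example (illustrative, v4i32): and (pcmpgtd A, B), splat(1) can become
/// psrld $31 of the compare result, since every lane of the compare is all-ones
/// or all-zeros and the logical shift right extracts just the low bit.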
36124 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
36125 const X86Subtarget &Subtarget) {
36126 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
36127 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
36128 EVT VT0 = Op0.getValueType();
36129 EVT VT1 = Op1.getValueType();
36131 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
36135 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
36136 !SplatVal.isMask())
36139 // Don't prevent creation of ANDN.
36140 if (isBitwiseNot(Op0))
36143 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
36146 unsigned EltBitWidth = VT0.getScalarSizeInBits();
36147 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
36151 unsigned ShiftVal = SplatVal.countTrailingOnes();
36152 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
36153 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
36154 return DAG.getBitcast(N->getValueType(0), Shift);
36157 // Get the index node from the lowered DAG of a GEP IR instruction with one
36158 // indexing dimension.
36159 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
36160 if (Ld->isIndexed())
36163 SDValue Base = Ld->getBasePtr();
36165 if (Base.getOpcode() != ISD::ADD)
36168 SDValue ShiftedIndex = Base.getOperand(0);
36170 if (ShiftedIndex.getOpcode() != ISD::SHL)
36173 return ShiftedIndex.getOperand(0);
36177 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
36178 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
36179 switch (VT.getSizeInBits()) {
36180 default: return false;
36181 case 64: return Subtarget.is64Bit() ? true : false;
36182 case 32: return true;
36188 // This function recognizes cases where the X86 BZHI instruction can replace an
36189 // 'and-load' sequence.
36190 // When an integer value is loaded from an array of constants defined as
36193 //   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF, ..., 2^(SIZE-1) - 1}
36195 // and a bitwise AND is then applied to the loaded value and another input,
36196 // the sequence is equivalent to performing BZHI (zero high bits) on the input
36197 // with the same index as the load.
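// For example (illustrative, 32-bit): with int array[32] = {0x0, 0x1, 0x3, ...}
// where array[i] == (1u << i) - 1, the expression (x & array[idx]) keeps only
// the low 'idx' bits of x, which is exactly bzhi(x, idx).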
36198 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
36199 const X86Subtarget &Subtarget) {
36200 MVT VT = Node->getSimpleValueType(0);
36203 // Check if subtarget has BZHI instruction for the node's type
36204 if (!hasBZHI(Subtarget, VT))
36207 // Try matching the pattern for both operands.
36208 for (unsigned i = 0; i < 2; i++) {
36209 SDValue N = Node->getOperand(i);
36210 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
36212 // Continue if the operand is not a load instruction.
if (!Ld)
  continue;
36216 const Value *MemOp = Ld->getMemOperand()->getValue();
36221 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
36222 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
36223 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
36225 Constant *Init = GV->getInitializer();
36226 Type *Ty = Init->getType();
36227 if (!isa<ConstantDataArray>(Init) ||
36228 !Ty->getArrayElementType()->isIntegerTy() ||
36229 Ty->getArrayElementType()->getScalarSizeInBits() !=
36230 VT.getSizeInBits() ||
36231 Ty->getArrayNumElements() >
36232 Ty->getArrayElementType()->getScalarSizeInBits())
36235 // Check if the array's constant elements are suitable to our case.
36236 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
36237 bool ConstantsMatch = true;
36238 for (uint64_t j = 0; j < ArrayElementCount; j++) {
36239 ConstantInt *Elem =
36240 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
36241 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
36242 ConstantsMatch = false;
36246 if (!ConstantsMatch)
36249 // Do the transformation (For 32-bit type):
36250 // -> (and (load arr[idx]), inp)
36251 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
36252 // that will be replaced with one bzhi instruction.
36253 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
36254 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
36256 // Get the Node which indexes into the array.
36257 SDValue Index = getIndexFromUnindexedLoad(Ld);
36260 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
36262 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
36263 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
36265 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
36266 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
36268 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
36276 // Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
36277 // Turn it into a series of XORs and a setnp.
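// For a 32-bit input this produces, roughly (illustrative register choice):
//   movl %edi, %eax ; shrl $16, %eax ; xorl %edi, %eax  ; fold the high half
//   xorb %ah, %al                                       ; 8-bit flag-setting xor
//   setnp %al ; movzbl %al, %eax                        ; 1 if an odd number of bits set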
36278 static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
36279 const X86Subtarget &Subtarget) {
36280 EVT VT = N->getValueType(0);
36282 // We only support 64-bit and 32-bit. 64-bit requires special handling
36283 // unless the 64-bit popcnt instruction is legal.
36284 if (VT != MVT::i32 && VT != MVT::i64)
36287 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36288 if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
36291 SDValue N0 = N->getOperand(0);
36292 SDValue N1 = N->getOperand(1);
36294 // LHS needs to be a single use CTPOP.
36295 if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
36298 // RHS needs to be 1.
36299 if (!isOneConstant(N1))
36303 SDValue X = N0.getOperand(0);
36305 // If this is 64-bit, it's always best to xor the two 32-bit pieces together
36306 // even if we have popcnt.
36307 if (VT == MVT::i64) {
36308 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
36309 DAG.getNode(ISD::SRL, DL, VT, X,
36310 DAG.getConstant(32, DL, MVT::i8)));
36311 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
36312 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
36313 // Generate a 32-bit parity idiom. This will bring us back here if we need
36314 // to expand it too.
36315 SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
36316 DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
36317 DAG.getConstant(1, DL, MVT::i32));
36318 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
36320 assert(VT == MVT::i32 && "Unexpected VT!");
36322 // Xor the high and low 16-bits together using a 32-bit operation.
36323 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
36324 DAG.getConstant(16, DL, MVT::i8));
36325 X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
36327 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
36328 // This should allow an h-reg to be used to save a shift.
36329 // FIXME: We only get an h-reg in 32-bit mode.
36330 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
36331 DAG.getNode(ISD::SRL, DL, VT, X,
36332 DAG.getConstant(8, DL, MVT::i8)));
36333 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
36334 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
36335 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
36337 // Copy the inverse of the parity flag into a register with setcc.
36338 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
36339 // Zero extend to original type.
36340 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
36343 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
36344 TargetLowering::DAGCombinerInfo &DCI,
36345 const X86Subtarget &Subtarget) {
36346 EVT VT = N->getValueType(0);
36348 // If this is SSE1 only convert to FAND to avoid scalarization.
36349 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
36350 return DAG.getBitcast(
36351 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
36352 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
36353 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
36356 // Use a 32-bit and+zext if upper bits known zero.
36357 if (VT == MVT::i64 && Subtarget.is64Bit() &&
36358 !isa<ConstantSDNode>(N->getOperand(1))) {
36359 APInt HiMask = APInt::getHighBitsSet(64, 32);
36360 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
36361 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
36363 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
36364 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
36365 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
36366 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
36370 // This must be done before legalization has expanded the ctpop.
36371 if (SDValue V = combineParity(N, DAG, Subtarget))
36374 if (DCI.isBeforeLegalizeOps())
36377 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
36380 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
36383 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
36386 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
36389 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
36392 // Attempt to recursively combine a bitmask AND with shuffles.
36393 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
36395 if (SDValue Res = combineX86ShufflesRecursively(
36396 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
36397 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
36401 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
36402 if ((VT.getScalarSizeInBits() % 8) == 0 &&
36403 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
36404 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
36405 SDValue BitMask = N->getOperand(1);
36406 SDValue SrcVec = N->getOperand(0).getOperand(0);
36407 EVT SrcVecVT = SrcVec.getValueType();
36409 // Check that the constant bitmask masks whole bytes.
36411 SmallVector<APInt, 64> EltBits;
36412 if (VT == SrcVecVT.getScalarType() &&
36413 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
36414 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
36415 llvm::all_of(EltBits, [](APInt M) {
36416 return M.isNullValue() || M.isAllOnesValue();
36418 unsigned NumElts = SrcVecVT.getVectorNumElements();
36419 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
36420 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
36422 // Create a root shuffle mask from the byte mask and the extracted index.
36423 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
36424 for (unsigned i = 0; i != Scale; ++i) {
36427 int VecIdx = Scale * Idx + i;
36428 ShuffleMask[VecIdx] =
36429 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
36432 if (SDValue Shuffle = combineX86ShufflesRecursively(
36433 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
36434 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
36435 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
36436 N->getOperand(0).getOperand(1));
36443 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
36444 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
36445 if (N->getOpcode() != ISD::OR)
36448 SDValue N0 = N->getOperand(0);
36449 SDValue N1 = N->getOperand(1);
36451 // Canonicalize AND to LHS.
36452 if (N1.getOpcode() == ISD::AND)
36455 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
36456 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
36459 Mask = N1.getOperand(0);
36460 X = N1.getOperand(1);
36462 // Check to see if the mask appeared in both the AND and ANDNP.
36463 if (N0.getOperand(0) == Mask)
36464 Y = N0.getOperand(1);
36465 else if (N0.getOperand(1) == Mask)
36466 Y = N0.getOperand(0);
36470 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
36471 // ANDNP combine allows other combines to happen that prevent matching.
// Try to fold:
36476 //   (or (and (m, y), (pandn m, x)))
// into:
36478 //   (vselect m, x, y)
36479 // As a special case, try to fold:
36480 //   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
36482 //   (sub (xor X, M), M)
36483 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
36484 const X86Subtarget &Subtarget) {
36485 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
36487 EVT VT = N->getValueType(0);
36488 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
36489 (VT.is256BitVector() && Subtarget.hasInt256())))
36492 SDValue X, Y, Mask;
36493 if (!matchLogicBlend(N, X, Y, Mask))
36496 // Validate that X, Y, and Mask are bitcasts, and see through them.
36497 Mask = peekThroughBitcasts(Mask);
36498 X = peekThroughBitcasts(X);
36499 Y = peekThroughBitcasts(Y);
36501 EVT MaskVT = Mask.getValueType();
36502 unsigned EltBits = MaskVT.getScalarSizeInBits();
36504 // TODO: Attempt to handle floating point cases as well?
36505 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
36511 // (or (and (M, (sub 0, X)), (pandn M, X)))
36512 // which is a special case of vselect:
36513 // (vselect M, (sub 0, X), X)
36515 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
36516 // We know that, if fNegate is 0 or 1:
36517 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
36519 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
36520 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
36521 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
36522 // This lets us transform our vselect to:
36523 // (add (xor X, M), (and M, 1))
36525 // (sub (xor X, M), M)
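// Sanity check of the identity with scalar values: if M == -1 (all ones) then
// (X ^ M) - M == ~X + 1 == -X, and if M == 0 then (X ^ 0) - 0 == X, as desired.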
36526 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
36527 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
36528 auto IsNegV = [](SDNode *N, SDValue V) {
36529 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
36530 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
36533 if (IsNegV(Y.getNode(), X))
36535 else if (IsNegV(X.getNode(), Y))
36539 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
36540 SDValue SubOp2 = Mask;
36542 // If the negate was on the false side of the select, then
36543 // the operands of the SUB need to be swapped. PR 27251.
36544 // This is because the pattern being matched above is
36545 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
36546 // but if the pattern matched was
36547 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
36548 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
36549 // pattern also needs to be a negation of the replacement pattern above.
36550 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
36551 // sub accomplishes the negation of the replacement pattern.
36553 std::swap(SubOp1, SubOp2);
36555 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
36556 return DAG.getBitcast(VT, Res);
36560 // PBLENDVB is only available on SSE 4.1.
36561 if (!Subtarget.hasSSE41())
36564 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
36566 X = DAG.getBitcast(BlendVT, X);
36567 Y = DAG.getBitcast(BlendVT, Y);
36568 Mask = DAG.getBitcast(BlendVT, Mask);
36569 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
36570 return DAG.getBitcast(VT, Mask);
36573 // Helper function for combineOrCmpEqZeroToCtlzSrl.
36577 // Transforms seteq(cmp x, 0) into srl(ctlz x), log2(bitsize(x)).
36578 // The input pattern is checked by the caller.
36579 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
36580 SelectionDAG &DAG) {
36581 SDValue Cmp = Op.getOperand(1);
36582 EVT VT = Cmp.getOperand(0).getValueType();
36583 unsigned Log2b = Log2_32(VT.getSizeInBits());
36585 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
36586 // The result of the shift is true or false, and on X86, the 32-bit
36587 // encoding of shr and lzcnt is more desirable.
36588 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
36589 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
36590 DAG.getConstant(Log2b, dl, MVT::i8));
36591 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
36594 // Try to transform:
36595 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
36597 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
36598 // Will also attempt to match more generic cases, eg:
36599 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
36600 // Only applies if the target supports the FastLZCNT feature.
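// For example (illustrative, i32 x and y with FastLZCNT): (x == 0) | (y == 0)
// becomes (lzcnt(x) >> 5) | (lzcnt(y) >> 5), since lzcnt of a 32-bit value is
// 32 (bit 5 set) exactly when the value is zero.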
36601 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
36602 TargetLowering::DAGCombinerInfo &DCI,
36603 const X86Subtarget &Subtarget) {
36604 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
36607 auto isORCandidate = [](SDValue N) {
36608 return (N->getOpcode() == ISD::OR && N->hasOneUse());
36611 // Check that the zero extend is extending to 32 bits or more. The code generated by
36612 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
36613 // instructions to clear the upper bits.
36614 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
36615 !isORCandidate(N->getOperand(0)))
36618 // Check the node matches: setcc(eq, cmp 0)
36619 auto isSetCCCandidate = [](SDValue N) {
36620 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
36621 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
36622 N->getOperand(1).getOpcode() == X86ISD::CMP &&
36623 isNullConstant(N->getOperand(1).getOperand(1)) &&
36624 N->getOperand(1).getValueType().bitsGE(MVT::i32);
36627 SDNode *OR = N->getOperand(0).getNode();
36628 SDValue LHS = OR->getOperand(0);
36629 SDValue RHS = OR->getOperand(1);
36631 // Save nodes matching or(or, setcc(eq, cmp 0)).
36632 SmallVector<SDNode *, 2> ORNodes;
36633 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
36634 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
36635 ORNodes.push_back(OR);
36636 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
36637 LHS = OR->getOperand(0);
36638 RHS = OR->getOperand(1);
36641 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
36642 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
36643 !isORCandidate(SDValue(OR, 0)))
36646 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
36648 // or(srl(ctlz),srl(ctlz)).
36649 // The dag combiner can then fold it into:
36650 // srl(or(ctlz, ctlz)).
36651 EVT VT = OR->getValueType(0);
36652 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
36653 SDValue Ret, NewRHS;
36654 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
36655 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
36660 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
36661 while (ORNodes.size() > 0) {
36662 OR = ORNodes.pop_back_val();
36663 LHS = OR->getOperand(0);
36664 RHS = OR->getOperand(1);
36665 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
36666 if (RHS->getOpcode() == ISD::OR)
36667 std::swap(LHS, RHS);
36668 EVT VT = OR->getValueType(0);
36669 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
36672 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
36676 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
36681 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
36682 TargetLowering::DAGCombinerInfo &DCI,
36683 const X86Subtarget &Subtarget) {
36684 SDValue N0 = N->getOperand(0);
36685 SDValue N1 = N->getOperand(1);
36686 EVT VT = N->getValueType(0);
36688 // If this is SSE1 only convert to FOR to avoid scalarization.
36689 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
36690 return DAG.getBitcast(MVT::v4i32,
36691 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
36692 DAG.getBitcast(MVT::v4f32, N0),
36693 DAG.getBitcast(MVT::v4f32, N1)));
36696 if (DCI.isBeforeLegalizeOps())
36699 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
36702 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
36705 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
36708 // Attempt to recursively combine an OR of shuffles.
36709 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
36711 if (SDValue Res = combineX86ShufflesRecursively(
36712 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
36713 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
36717 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
36720 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
36721 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
36722 unsigned Bits = VT.getScalarSizeInBits();
36724 // SHLD/SHRD instructions have lower register pressure, but on some
36725 // platforms they have higher latency than the equivalent
36726 // series of shifts/or that would otherwise be generated.
36727 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
36728 // have higher latencies and we are not optimizing for size.
36729 if (!OptForSize && Subtarget.isSHLDSlow())
36732 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
36734 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
36736 if (!N0.hasOneUse() || !N1.hasOneUse())
36739 SDValue ShAmt0 = N0.getOperand(1);
36740 if (ShAmt0.getValueType() != MVT::i8)
36742 SDValue ShAmt1 = N1.getOperand(1);
36743 if (ShAmt1.getValueType() != MVT::i8)
36746 // Peek through any modulo shift masks.
36748 if (ShAmt0.getOpcode() == ISD::AND &&
36749 isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
36750 ShAmt0.getConstantOperandVal(1) == (Bits - 1)) {
36752 ShAmt0 = ShAmt0.getOperand(0);
36755 if (ShAmt1.getOpcode() == ISD::AND &&
36756 isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
36757 ShAmt1.getConstantOperandVal(1) == (Bits - 1)) {
36759 ShAmt1 = ShAmt1.getOperand(0);
36762 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
36763 ShAmt0 = ShAmt0.getOperand(0);
36764 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
36765 ShAmt1 = ShAmt1.getOperand(0);
36768 unsigned Opc = X86ISD::SHLD;
36769 SDValue Op0 = N0.getOperand(0);
36770 SDValue Op1 = N1.getOperand(0);
36771 if (ShAmt0.getOpcode() == ISD::SUB ||
36772 ShAmt0.getOpcode() == ISD::XOR) {
36773 Opc = X86ISD::SHRD;
36774 std::swap(Op0, Op1);
36775 std::swap(ShAmt0, ShAmt1);
36776 std::swap(ShMsk0, ShMsk1);
36779 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
36780 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
36781 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
36782 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
36783 // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C )
36784 // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C )
36785 if (ShAmt1.getOpcode() == ISD::SUB) {
36786 SDValue Sum = ShAmt1.getOperand(0);
36787 if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
36788 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
36789 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
36790 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
36791 if ((SumC->getAPIntValue() == Bits ||
36792 (SumC->getAPIntValue() == 0 && ShMsk1)) &&
36793 ShAmt1Op1 == ShAmt0)
36794 return DAG.getNode(Opc, DL, VT, Op0, Op1,
36795 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
36797 } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
36798 auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
36799 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
36800 return DAG.getNode(Opc, DL, VT,
36801 N0.getOperand(0), N1.getOperand(0),
36802 DAG.getNode(ISD::TRUNCATE, DL,
36804 } else if (ShAmt1.getOpcode() == ISD::XOR) {
36805 SDValue Mask = ShAmt1.getOperand(1);
36806 if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
36807 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
36808 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
36809 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
36810 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
36811 if (MaskC->getSExtValue() == (Bits - 1) &&
36812 (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
36813 if (Op1.getOpcode() == InnerShift &&
36814 isa<ConstantSDNode>(Op1.getOperand(1)) &&
36815 Op1.getConstantOperandVal(1) == 1) {
36816 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
36817 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
36819 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
36820 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
36821 Op1.getOperand(0) == Op1.getOperand(1)) {
36822 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
36823 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
36832 /// Try to turn tests against the signbit in the form of:
36833 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
36836 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
36837 // This is only worth doing if the output type is i8 or i1.
36838 EVT ResultType = N->getValueType(0);
36839 if (ResultType != MVT::i8 && ResultType != MVT::i1)
36842 SDValue N0 = N->getOperand(0);
36843 SDValue N1 = N->getOperand(1);
36845 // We should be performing an xor against a truncated shift.
36846 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
36849 // Make sure we are performing an xor against one.
36850 if (!isOneConstant(N1))
36853 // SetCC on x86 zero-extends, so only act on this if it's a logical shift.
36854 SDValue Shift = N0.getOperand(0);
36855 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
36858 // Make sure we are truncating from one of i16, i32 or i64.
36859 EVT ShiftTy = Shift.getValueType();
36860 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
36863 // Make sure the shift amount extracts the sign bit.
36864 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
36865 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
36868 // Create a greater-than comparison against -1.
36869 // N.B. Using SETGE against 0 works, but we want a canonical-looking
36870 // comparison; using SETGT matches up with what TranslateX86CC does.
36872 SDValue ShiftOp = Shift.getOperand(0);
36873 EVT ShiftOpTy = ShiftOp.getValueType();
36874 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36875 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
36876 *DAG.getContext(), ResultType);
36877 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
36878 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
36879 if (SetCCResultType != ResultType)
36880 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
36884 /// Turn vector tests of the signbit in the form of:
36885 /// xor (sra X, elt_size(X)-1), -1 into: pcmpgt X, -1
36889 /// This should be called before type legalization because the pattern may not
36890 /// persist after that.
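/// For example, for v4i32:
///   xor (sra X, 31), -1
/// yields all-ones in exactly the lanes where the element is non-negative,
/// which is what pcmpgtd X, -1 computes.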
36891 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
36892 const X86Subtarget &Subtarget) {
36893 EVT VT = N->getValueType(0);
36894 if (!VT.isSimple())
36897 switch (VT.getSimpleVT().SimpleTy) {
36898 default: return SDValue();
36901 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
36902 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
36906 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
36909 // There must be an arithmetic shift right (sra) before the xor, and the xor
36910 // must be a 'not' operation.
36911 SDValue Shift = N->getOperand(0);
36912 SDValue Ones = N->getOperand(1);
36913 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
36914 !ISD::isBuildVectorAllOnes(Ones.getNode()))
36917 // The shift should be smearing the sign bit across each vector element.
36918 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
36922 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
36923 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
36924 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
36927 // Create a greater-than comparison against -1. We don't use the more obvious
36928 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
36929 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
36932 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
36933 /// is valid for the given \p Subtarget.
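/// For example, v16i32 -> v16i8 is valid with plain AVX512F (512-bit source),
/// a 256-bit source such as v8i32 -> v8i8 additionally requires VLX, and
/// 16-bit source elements require BWI.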
36934 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
36935 const X86Subtarget &Subtarget) {
36936 if (!Subtarget.hasAVX512())
36939 // FIXME: Scalar type may be supported if we move it to vector register.
36940 if (!SrcVT.isVector())
36943 EVT SrcElVT = SrcVT.getScalarType();
36944 EVT DstElVT = DstVT.getScalarType();
36945 if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
36947 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
36948 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
36952 /// Detect patterns of truncation with unsigned saturation:
36954 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
36955 /// Return the source value x to be truncated or SDValue() if the pattern was not matched.
36958 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
36959 /// where C1 >= 0 and C2 is unsigned max of destination type.
36961 /// or: (truncate (smax (smin (x, C2), C1)) to dest_type)
36962 /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
36964 /// These two patterns are equivalent to:
36965 /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
36966 /// So return the smax(x, C1) value to be truncated or SDValue() if the
36967 /// pattern was not matched.
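/// For example, when truncating vXi32 to vXi8, pattern 1 is
///   (truncate (umin x, 255) to vXi8)
/// and the returned value is x, so the caller can emit an unsigned-saturating
/// truncation (e.g. vpmovusdb on AVX512) instead.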
36968 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
36970 EVT InVT = In.getValueType();
36972 // Saturation with truncation. We truncate from InVT to VT.
36973 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
36974 "Unexpected types for truncate operation");
36976 // Match min/max and return limit value as a parameter.
36977 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
36978 if (V.getOpcode() == Opcode &&
36979 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
36980 return V.getOperand(0);
36985 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
36986 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
36987 // the element size of the destination type.
36988 if (C2.isMask(VT.getScalarSizeInBits()))
36991 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
36992 if (MatchMinMax(SMin, ISD::SMAX, C1))
36993 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
36996 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
36997 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
36998 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
37000 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
37006 /// Detect patterns of truncation with signed saturation:
37007 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
37008 /// signed_max_of_dest_type)) to dest_type)
37010 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
37011 /// signed_min_of_dest_type)) to dest_type).
37012 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
37013 /// Return the source value to be truncated or SDValue() if the pattern was not matched.
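/// For example, a signed-saturating truncation of vXi32 to vXi8 matches
///   (truncate (smin (smax x, -128), 127) to vXi8)
/// and with MatchPackUS the clamp range becomes [0, 255] instead.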
37015 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
37016 unsigned NumDstBits = VT.getScalarSizeInBits();
37017 unsigned NumSrcBits = In.getScalarValueSizeInBits();
37018 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
37020 auto MatchMinMax = [](SDValue V, unsigned Opcode,
37021 const APInt &Limit) -> SDValue {
37023 if (V.getOpcode() == Opcode &&
37024 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
37025 return V.getOperand(0);
37029 APInt SignedMax, SignedMin;
37031 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
37032 SignedMin = APInt(NumSrcBits, 0);
37034 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
37035 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
37038 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
37039 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
37042 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
37043 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
37049 /// Detect a pattern of truncation with signed saturation.
37050 /// The types should allow using the VPMOVS* truncating instructions on AVX512.
37051 /// Return the source value to be truncated or SDValue() if the pattern was not matched.
37053 static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
37054 const X86Subtarget &Subtarget,
37055 const TargetLowering &TLI) {
37056 if (!TLI.isTypeLegal(In.getValueType()))
37058 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
37060 return detectSSatPattern(In, VT);
37063 /// Detect a pattern of truncation with saturation:
37064 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
37065 /// The types should allow using the VPMOVUS* truncating instructions on AVX512.
37066 /// Return the source value to be truncated or SDValue() if the pattern was not matched.
37068 static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
37070 const X86Subtarget &Subtarget,
37071 const TargetLowering &TLI) {
37072 if (!TLI.isTypeLegal(In.getValueType()))
37074 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
37076 return detectUSatPattern(In, VT, DAG, DL);
37079 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
37081 const X86Subtarget &Subtarget) {
37082 EVT SVT = VT.getScalarType();
37083 EVT InVT = In.getValueType();
37084 EVT InSVT = InVT.getScalarType();
37085 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37086 if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
37087 isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
37088 if (auto SSatVal = detectSSatPattern(In, VT))
37089 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
37090 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
37091 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
37093 if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
37094 !Subtarget.hasAVX512() &&
37095 (SVT == MVT::i8 || SVT == MVT::i16) &&
37096 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
37097 if (auto USatVal = detectSSatPattern(In, VT, true)) {
37098 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
37099 if (SVT == MVT::i8 && InSVT == MVT::i32) {
37100 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
37101 VT.getVectorNumElements());
37102 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
37105 return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
37107 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
37108 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
37111 if (auto SSatVal = detectSSatPattern(In, VT))
37112 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
37118 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
37119 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
37120 /// X86ISD::AVG instruction.
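/// For example, for unsigned i8 elements the scalar form c = (a + b + 1) / 2,
/// computed in a wider type and truncated back to i8, maps to a single pavgb
/// (or pavgw for i16).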
37121 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
37122 const X86Subtarget &Subtarget,
37124 if (!VT.isVector())
37126 EVT InVT = In.getValueType();
37127 unsigned NumElems = VT.getVectorNumElements();
37129 EVT ScalarVT = VT.getVectorElementType();
37130 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
37131 NumElems >= 2 && isPowerOf2_32(NumElems)))
37134 // InScalarVT is the intermediate type in the AVG pattern and it should be
37135 // wider than the original input type (i8/i16).
37136 EVT InScalarVT = InVT.getVectorElementType();
37137 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
37140 if (!Subtarget.hasSSE2())
37143 // Detect the following pattern:
37145 // %1 = zext <N x i8> %a to <N x i32>
37146 // %2 = zext <N x i8> %b to <N x i32>
37147 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
37148 // %4 = add nuw nsw <N x i32> %3, %2
37149 // %5 = lshr <N x i32> %4, <i32 1 x N>
37150 // %6 = trunc <N x i32> %5 to <N x i8>
37152 // In AVX512, the last instruction can also be a trunc store.
37153 if (In.getOpcode() != ISD::SRL)
37156 // A lambda checking that the given SDValue is a constant vector and that each
37157 // element is in the range [Min, Max].
37158 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
37159 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
37160 if (!BV || !BV->isConstant())
37162 for (SDValue Op : V->ops()) {
37163 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
37166 const APInt &Val = C->getAPIntValue();
37167 if (Val.ult(Min) || Val.ugt(Max))
37173 // Check that each element of the vector is shifted right by one.
37174 auto LHS = In.getOperand(0);
37175 auto RHS = In.getOperand(1);
37176 if (!IsConstVectorInRange(RHS, 1, 1))
37178 if (LHS.getOpcode() != ISD::ADD)
37181 // Detect a pattern of a + b + 1 where the order doesn't matter.
37182 SDValue Operands[3];
37183 Operands[0] = LHS.getOperand(0);
37184 Operands[1] = LHS.getOperand(1);
37186 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
37187 ArrayRef<SDValue> Ops) {
37188 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
37191 // Take care of the case when one of the operands is a constant vector whose
37192 // element is in the range [1, 256] (or [1, 65536] for i16 elements).
37193 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
37194 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
37195 Operands[0].getOperand(0).getValueType() == VT) {
37196 // The pattern is detected. Subtract one from the constant vector, then
37197 // demote it and emit X86ISD::AVG instruction.
37198 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
37199 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
37200 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
37201 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
37202 { Operands[0].getOperand(0), Operands[1] },
37206 if (Operands[0].getOpcode() == ISD::ADD)
37207 std::swap(Operands[0], Operands[1]);
37208 else if (Operands[1].getOpcode() != ISD::ADD)
37210 Operands[2] = Operands[1].getOperand(0);
37211 Operands[1] = Operands[1].getOperand(1);
37213 // Now we have three operands of two additions. Check that one of them is a
37214 // constant vector with ones, and the other two are promoted from i8/i16.
37215 for (int i = 0; i < 3; ++i) {
37216 if (!IsConstVectorInRange(Operands[i], 1, 1))
37218 std::swap(Operands[i], Operands[2]);
37220 // Check if Operands[0] and Operands[1] are results of type promotion.
37221 for (int j = 0; j < 2; ++j)
37222 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
37223 Operands[j].getOperand(0).getValueType() != VT)
37226 // The pattern is detected, emit X86ISD::AVG instruction(s).
37227 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
37228 { Operands[0].getOperand(0),
37229 Operands[1].getOperand(0) }, AVGBuilder);
37235 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
37236 TargetLowering::DAGCombinerInfo &DCI,
37237 const X86Subtarget &Subtarget) {
37238 LoadSDNode *Ld = cast<LoadSDNode>(N);
37239 EVT RegVT = Ld->getValueType(0);
37240 EVT MemVT = Ld->getMemoryVT();
37242 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37244 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
37245 // into two 16-byte operations. Also split non-temporal aligned loads on
37246 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
37247 ISD::LoadExtType Ext = Ld->getExtensionType();
37249 unsigned AddressSpace = Ld->getAddressSpace();
37250 unsigned Alignment = Ld->getAlignment();
37251 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
37252 Ext == ISD::NON_EXTLOAD &&
37253 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
37254 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
37255 AddressSpace, Alignment, &Fast) && !Fast))) {
37256 unsigned NumElems = RegVT.getVectorNumElements();
37260 SDValue Ptr = Ld->getBasePtr();
37262 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
37265 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
37266 Alignment, Ld->getMemOperand()->getFlags());
37268 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
37270 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
37271 Ld->getPointerInfo().getWithOffset(16),
37272 MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
37273 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
37275 Load2.getValue(1));
37277 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
37278 return DCI.CombineTo(N, NewVec, TF, true);
37284 /// If V is a build vector of boolean constants and exactly one of those
37285 /// constants is true, return the operand index of that true element.
37286 /// Otherwise, return -1.
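/// For example, <i1 0, i1 0, i1 1, i1 0> returns 2, while
/// <i1 1, i1 0, i1 1, i1 0> (two true elements) returns -1.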
37287 static int getOneTrueElt(SDValue V) {
37288 // This needs to be a build vector of booleans.
37289 // TODO: Checking for the i1 type matches the IR definition for the mask,
37290 // but the mask check could be loosened to i8 or other types. That might
37291 // also require checking more than 'allOnesValue'; eg, the x86 HW
37292 // instructions only require that the MSB is set for each mask element.
37293 // The ISD::MSTORE comments/definition do not specify how the mask operand is formatted.
37295 auto *BV = dyn_cast<BuildVectorSDNode>(V);
37296 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
37299 int TrueIndex = -1;
37300 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
37301 for (unsigned i = 0; i < NumElts; ++i) {
37302 const SDValue &Op = BV->getOperand(i);
37305 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
37308 if (ConstNode->getAPIntValue().isAllOnesValue()) {
37309 // If we already found a one, this is too many.
37310 if (TrueIndex >= 0)
37318 /// Given a masked memory load/store operation, return true if it has one mask
37319 /// bit set. If it has one mask bit set, then also return the memory address of
37320 /// the scalar element to load/store, the vector index to insert/extract that
37321 /// scalar element, and the alignment for the scalar memory access.
37322 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
37323 SelectionDAG &DAG, SDValue &Addr,
37324 SDValue &Index, unsigned &Alignment) {
37325 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
37326 if (TrueMaskElt < 0)
37329 // Get the address of the one scalar element that is specified by the mask
37330 // using the appropriate offset from the base pointer.
37331 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
37332 Addr = MaskedOp->getBasePtr();
37333 if (TrueMaskElt != 0) {
37334 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
37335 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
37338 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
37339 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
37343 /// If exactly one element of the mask is set for a non-extending masked load,
37344 /// reduce it to a scalar load and a vector element insert.
37345 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
37346 /// mask have already been optimized in IR, so we don't bother with those here.
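/// For example, a masked load of <4 x i32> with the constant mask
/// <i1 0, i1 0, i1 1, i1 0> becomes a scalar i32 load from base+8 followed by
/// an insert of that scalar into element 2 of the pass-through vector.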
37348 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
37349 TargetLowering::DAGCombinerInfo &DCI) {
37350 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
37351 // However, some target hooks may need to be added to know when the transform
37352 // is profitable. Endianness would also have to be considered.
37354 SDValue Addr, VecIndex;
37355 unsigned Alignment;
37356 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
37359 // Load the one scalar element that is specified by the mask using the
37360 // appropriate offset from the base pointer.
37362 EVT VT = ML->getValueType(0);
37363 EVT EltVT = VT.getVectorElementType();
37365 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
37366 Alignment, ML->getMemOperand()->getFlags());
37368 // Insert the loaded element into the appropriate place in the vector.
37369 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
37370 ML->getPassThru(), Load, VecIndex);
37371 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
37375 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
37376 TargetLowering::DAGCombinerInfo &DCI) {
37377 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
37381 EVT VT = ML->getValueType(0);
37383 // If we are loading the first and last elements of a vector, it is safe and
37384 // always faster to load the whole vector. Replace the masked load with a
37385 // vector load and select.
37386 unsigned NumElts = VT.getVectorNumElements();
37387 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
37388 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
37389 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
37390 if (LoadFirstElt && LoadLastElt) {
37391 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
37392 ML->getMemOperand());
37393 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
37394 ML->getPassThru());
37395 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
37398 // Convert a masked load with a constant mask into a masked load and a select.
37399 // This allows the select operation to use a faster kind of select instruction
37400 // (for example, vblendvps -> vblendps).
37402 // Don't try this if the pass-through operand is already undefined. That would
37403 // cause an infinite loop because that's what we're about to create.
37404 if (ML->getPassThru().isUndef())
37407 // The new masked load has an undef pass-through operand. The select uses the
37408 // original pass-through operand.
37409 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
37410 ML->getMask(), DAG.getUNDEF(VT),
37411 ML->getMemoryVT(), ML->getMemOperand(),
37412 ML->getExtensionType());
37413 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
37414 ML->getPassThru());
37416 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
37419 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
37420 TargetLowering::DAGCombinerInfo &DCI,
37421 const X86Subtarget &Subtarget) {
37422 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
37424 // TODO: Expanding load with constant mask may be optimized as well.
37425 if (Mld->isExpandingLoad())
37428 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
37429 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
37431 // TODO: Do some AVX512 subsets benefit from this transform?
37432 if (!Subtarget.hasAVX512())
37433 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
37437 if (Mld->getExtensionType() != ISD::SEXTLOAD)
37440 // Resolve extending loads.
37441 EVT VT = Mld->getValueType(0);
37442 unsigned NumElems = VT.getVectorNumElements();
37443 EVT LdVT = Mld->getMemoryVT();
37446 assert(LdVT != VT && "Cannot extend to the same type");
37447 unsigned ToSz = VT.getScalarSizeInBits();
37448 unsigned FromSz = LdVT.getScalarSizeInBits();
37449 // From/To sizes and ElemCount must be pow of two.
37450 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
37451 "Unexpected size for extending masked load");
37453 unsigned SizeRatio = ToSz / FromSz;
37454 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
37456 // Create a type on which we perform the shuffle.
37457 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
37458 LdVT.getScalarType(), NumElems*SizeRatio);
37459 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
37461 // Convert PassThru value.
37462 SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru());
37463 if (!Mld->getPassThru().isUndef()) {
37464 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
37465 for (unsigned i = 0; i != NumElems; ++i)
37466 ShuffleVec[i] = i * SizeRatio;
37468 // Can't shuffle using an illegal type.
37469 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
37470 "WideVecVT should be legal");
37471 WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru,
37472 DAG.getUNDEF(WideVecVT), ShuffleVec);
37475 // Prepare the new mask.
37477 SDValue Mask = Mld->getMask();
37478 if (Mask.getValueType() == VT) {
37479 // Mask and original value have the same type.
37480 NewMask = DAG.getBitcast(WideVecVT, Mask);
37481 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
37482 for (unsigned i = 0; i != NumElems; ++i)
37483 ShuffleVec[i] = i * SizeRatio;
37484 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
37485 ShuffleVec[i] = NumElems * SizeRatio;
37486 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
37487 DAG.getConstant(0, dl, WideVecVT),
37490 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
37491 unsigned WidenNumElts = NumElems*SizeRatio;
37492 unsigned MaskNumElts = VT.getVectorNumElements();
37493 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
37496 unsigned NumConcat = WidenNumElts / MaskNumElts;
37497 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
37498 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
37500 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
37503 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
37504 Mld->getBasePtr(), NewMask, WidePassThru,
37505 Mld->getMemoryVT(), Mld->getMemOperand(),
37507 SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG);
37508 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
37511 /// If exactly one element of the mask is set for a non-truncating masked store,
37512 /// reduce it to a vector element extract and a scalar store.
37513 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
37514 /// mask have already been optimized in IR, so we don't bother with those here.
37515 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
37516 SelectionDAG &DAG) {
37517 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
37518 // However, some target hooks may need to be added to know when the transform
37519 // is profitable. Endianness would also have to be considered.
37521 SDValue Addr, VecIndex;
37522 unsigned Alignment;
37523 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
37526 // Extract the one scalar element that is actually being stored.
37528 EVT VT = MS->getValue().getValueType();
37529 EVT EltVT = VT.getVectorElementType();
37530 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
37531 MS->getValue(), VecIndex);
37533 // Store that element at the appropriate offset from the base pointer.
37534 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
37535 Alignment, MS->getMemOperand()->getFlags());
37538 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
37539 TargetLowering::DAGCombinerInfo &DCI,
37540 const X86Subtarget &Subtarget) {
37541 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
37542 if (Mst->isCompressingStore())
37545 EVT VT = Mst->getValue().getValueType();
37546 if (!Mst->isTruncatingStore()) {
37547 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
37548 return ScalarStore;
37550 // If the mask value has been legalized to a non-boolean vector, try to
37551 // simplify ops leading up to it. We only demand the MSB of each lane.
37552 SDValue Mask = Mst->getMask();
37553 if (Mask.getScalarValueSizeInBits() != 1) {
37554 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37555 APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
37556 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
37557 return SDValue(N, 0);
37560 // TODO: AVX512 targets should also be able to simplify something like the
37561 // pattern above, but that pattern will be different. It will either need to
37562 // match setcc more generally or match PCMPGTM later (in tablegen?).
37567 // Resolve truncating stores.
37568 unsigned NumElems = VT.getVectorNumElements();
37569 EVT StVT = Mst->getMemoryVT();
37572 assert(StVT != VT && "Cannot truncate to the same type");
37573 unsigned FromSz = VT.getScalarSizeInBits();
37574 unsigned ToSz = StVT.getScalarSizeInBits();
37576 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37578 // The truncating store is legal in some cases. For example,
37579 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
37580 // implement truncating stores directly.
37581 // In this case we don't need any further transformations.
37582 if (TLI.isTruncStoreLegal(VT, StVT))
37585 // From/To sizes and ElemCount must be pow of two.
37586 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
37587 "Unexpected size for truncating masked store");
37588 // We are going to use the original vector elt for storing.
37589 // Accumulated smaller vector elements must be a multiple of the store size.
37590 assert (((NumElems * FromSz) % ToSz) == 0 &&
37591 "Unexpected ratio for truncating masked store");
37593 unsigned SizeRatio = FromSz / ToSz;
37594 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
37596 // Create a type on which we perform the shuffle.
37597 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
37598 StVT.getScalarType(), NumElems*SizeRatio);
37600 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
37602 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
37603 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
37604 for (unsigned i = 0; i != NumElems; ++i)
37605 ShuffleVec[i] = i * SizeRatio;
37607 // Can't shuffle using an illegal type.
37608 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
37609 "WideVecVT should be legal");
37611 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
37612 DAG.getUNDEF(WideVecVT),
37616 SDValue Mask = Mst->getMask();
37617 if (Mask.getValueType() == VT) {
37618 // Mask and original value have the same type.
37619 NewMask = DAG.getBitcast(WideVecVT, Mask);
37620 for (unsigned i = 0; i != NumElems; ++i)
37621 ShuffleVec[i] = i * SizeRatio;
37622 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
37623 ShuffleVec[i] = NumElems*SizeRatio;
37624 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
37625 DAG.getConstant(0, dl, WideVecVT),
37628 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
37629 unsigned WidenNumElts = NumElems*SizeRatio;
37630 unsigned MaskNumElts = VT.getVectorNumElements();
37631 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
37634 unsigned NumConcat = WidenNumElts / MaskNumElts;
37635 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
37636 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
37638 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
37641 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
37642 Mst->getBasePtr(), NewMask, StVT,
37643 Mst->getMemOperand(), false);
37646 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
37647 const X86Subtarget &Subtarget) {
37648 StoreSDNode *St = cast<StoreSDNode>(N);
37649 EVT VT = St->getValue().getValueType();
37650 EVT StVT = St->getMemoryVT();
37652 SDValue StoredVal = St->getOperand(1);
37653 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37655 // Convert a store of vXi1 into a store of iX and a bitcast.
37656 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
37657 VT.getVectorElementType() == MVT::i1) {
37659 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
37660 StoredVal = DAG.getBitcast(NewVT, StoredVal);
37662 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
37663 St->getPointerInfo(), St->getAlignment(),
37664 St->getMemOperand()->getFlags());
37667 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
37668 // This will avoid a copy to k-register.
37669 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
37670 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37671 StoredVal.getOperand(0).getValueType() == MVT::i8) {
37672 return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
37673 St->getBasePtr(), St->getPointerInfo(),
37674 St->getAlignment(), St->getMemOperand()->getFlags());
37677 // Widen v2i1/v4i1 stores to v8i1.
37678 if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
37679 Subtarget.hasAVX512()) {
37680 unsigned NumConcats = 8 / VT.getVectorNumElements();
37681 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
37682 Ops[0] = StoredVal;
37683 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
37684 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
37685 St->getPointerInfo(), St->getAlignment(),
37686 St->getMemOperand()->getFlags());
37689 // Turn vXi1 stores of constants into a scalar store.
37690 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
37691 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
37692 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
37693 // If it's a v64i1 store without 64-bit support, we need two stores.
37694 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
37695 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
37696 StoredVal->ops().slice(0, 32));
37697 Lo = combinevXi1ConstantToInteger(Lo, DAG);
37698 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
37699 StoredVal->ops().slice(32, 32));
37700 Hi = combinevXi1ConstantToInteger(Hi, DAG);
37702 unsigned Alignment = St->getAlignment();
37704 SDValue Ptr0 = St->getBasePtr();
37705 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
37708 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
37709 Alignment, St->getMemOperand()->getFlags());
37711 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
37712 St->getPointerInfo().getWithOffset(4),
37713 MinAlign(Alignment, 4U),
37714 St->getMemOperand()->getFlags());
37715 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
37718 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
37719 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
37720 St->getPointerInfo(), St->getAlignment(),
37721 St->getMemOperand()->getFlags());
37724 // If we are saving a concatenation of two XMM registers and 32-byte stores
37725 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
37727 unsigned AddressSpace = St->getAddressSpace();
37728 unsigned Alignment = St->getAlignment();
37729 if (VT.is256BitVector() && StVT == VT &&
37730 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
37731 AddressSpace, Alignment, &Fast) &&
37733 unsigned NumElems = VT.getVectorNumElements();
37737 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
37738 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
37740 SDValue Ptr0 = St->getBasePtr();
37741 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
37744 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
37745 Alignment, St->getMemOperand()->getFlags());
37747 DAG.getStore(St->getChain(), dl, Value1, Ptr1,
37748 St->getPointerInfo().getWithOffset(16),
37749 MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
37750 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
37753 // Optimize trunc store (of multiple scalars) to shuffle and store.
37754 // First, pack all of the elements in one place. Next, store to memory
37755 // in fewer chunks.
37756 if (St->isTruncatingStore() && VT.isVector()) {
37757 // Check if we can detect an AVG pattern from the truncation. If yes,
37758 // replace the trunc store by a normal store with the result of the X86ISD::AVG instruction.
37760 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
37762 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
37763 St->getPointerInfo(), St->getAlignment(),
37764 St->getMemOperand()->getFlags());
37766 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37768 detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
37770 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
37771 dl, Val, St->getBasePtr(),
37772 St->getMemoryVT(), St->getMemOperand(), DAG);
37773 if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
37774 DAG, dl, Subtarget, TLI))
37775 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
37776 dl, Val, St->getBasePtr(),
37777 St->getMemoryVT(), St->getMemOperand(), DAG);
37779 unsigned NumElems = VT.getVectorNumElements();
37780 assert(StVT != VT && "Cannot truncate to the same type");
37781 unsigned FromSz = VT.getScalarSizeInBits();
37782 unsigned ToSz = StVT.getScalarSizeInBits();
37784 // The truncating store is legal in some cases. For example,
37785 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
37786 // implement truncating stores directly.
37787 // In this case we don't need any further transformations.
37788 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
37791 // From, To sizes and ElemCount must be pow of two
37792 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
37793 // We are going to use the original vector elt for storing.
37794 // Accumulated smaller vector elements must be a multiple of the store size.
37795 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
37797 unsigned SizeRatio = FromSz / ToSz;
37799 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
37801 // Create a type on which we perform the shuffle
37802 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
37803 StVT.getScalarType(), NumElems*SizeRatio);
37805 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
37807 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
37808 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
37809 for (unsigned i = 0; i != NumElems; ++i)
37810 ShuffleVec[i] = i * SizeRatio;
37812 // Can't shuffle using an illegal type.
37813 if (!TLI.isTypeLegal(WideVecVT))
37816 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
37817 DAG.getUNDEF(WideVecVT),
37819 // At this point all of the data is stored at the bottom of the
37820 // register. We now need to save it to memory.
37822 // Find the largest store unit
37823 MVT StoreType = MVT::i8;
37824 for (MVT Tp : MVT::integer_valuetypes()) {
37825 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
37829 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
37830 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
37831 (64 <= NumElems * ToSz))
37832 StoreType = MVT::f64;
37834 // Bitcast the original vector into a vector of store-size units
37835 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
37836 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
37837 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
37838 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
37839 SmallVector<SDValue, 8> Chains;
37840 SDValue Ptr = St->getBasePtr();
37842 // Perform one or more big stores into memory.
37843 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
37844 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
37845 StoreType, ShuffWide,
37846 DAG.getIntPtrConstant(i, dl));
37848 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
37849 St->getAlignment(), St->getMemOperand()->getFlags());
37850 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
37851 Chains.push_back(Ch);
37854 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
37857 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
37858 // the FP state in cases where an emms may be missing.
37859 // A preferable solution to the general problem is to figure out the right
37860 // places to insert EMMS. This qualifies as a quick hack.
37862 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
37863 if (VT.getSizeInBits() != 64)
37866 const Function &F = DAG.getMachineFunction().getFunction();
37867 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
37869 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
37870 if ((VT.isVector() ||
37871 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
37872 isa<LoadSDNode>(St->getValue()) &&
37873 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
37874 St->getChain().hasOneUse() && !St->isVolatile()) {
37875 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
37876 SmallVector<SDValue, 8> Ops;
37878 if (!ISD::isNormalLoad(Ld))
37881 // If this is not the MMX case, i.e. we are just turning i64 load/store
37882 // into f64 load/store, avoid the transformation if there are multiple
37883 // uses of the loaded value.
37884 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
37889 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
37890 // Otherwise, if it's legal to use f64 SSE instructions, use an f64 load/store pair instead.
37892 if (Subtarget.is64Bit() || F64IsLegal) {
37893 MVT LdVT = (Subtarget.is64Bit() &&
37894 (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64;
37895 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
37896 Ld->getMemOperand());
37898 // Make sure new load is placed in same chain order.
37899 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
37900 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
37901 St->getMemOperand());
37904 // Otherwise, lower to two pairs of 32-bit loads / stores.
37905 SDValue LoAddr = Ld->getBasePtr();
37906 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
37908 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
37909 Ld->getPointerInfo(), Ld->getAlignment(),
37910 Ld->getMemOperand()->getFlags());
37911 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
37912 Ld->getPointerInfo().getWithOffset(4),
37913 MinAlign(Ld->getAlignment(), 4),
37914 Ld->getMemOperand()->getFlags());
37915 // Make sure new loads are placed in same chain order.
37916 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
37917 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
37919 LoAddr = St->getBasePtr();
37920 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
37923 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
37924 St->getAlignment(), St->getMemOperand()->getFlags());
37925 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
37926 St->getPointerInfo().getWithOffset(4),
37927 MinAlign(St->getAlignment(), 4),
37928 St->getMemOperand()->getFlags());
37929 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
37932 // This is similar to the above case, but here we handle a scalar 64-bit
37933 // integer store that is extracted from a vector on a 32-bit target.
37934 // If we have SSE2, then we can treat it like a floating-point double
37935 // to get past legalization. The execution dependencies fixup pass will
37936 // choose the optimal machine instruction for the store if this really is
37937 // an integer or v2f32 rather than an f64.
37938 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
37939 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
37940 SDValue OldExtract = St->getOperand(1);
37941 SDValue ExtOp0 = OldExtract.getOperand(0);
37942 unsigned VecSize = ExtOp0.getValueSizeInBits();
37943 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
37944 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
37945 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
37946 BitCast, OldExtract.getOperand(1));
37947 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
37948 St->getPointerInfo(), St->getAlignment(),
37949 St->getMemOperand()->getFlags());
37955 /// Return 'true' if this vector operation is "horizontal"
37956 /// and return the operands for the horizontal operation in LHS and RHS. A
37957 /// horizontal operation performs the binary operation on successive elements
37958 /// of its first operand, then on successive elements of its second operand,
37959 /// returning the resulting values in a vector. For example, if
37960 /// A = < float a0, float a1, float a2, float a3 >
37962 /// B = < float b0, float b1, float b2, float b3 >
37963 /// then the result of doing a horizontal operation on A and B is
37964 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
37965 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
37966 /// A horizontal-op B, for some already available A and B, and if so then LHS is
37967 /// set to A, RHS to B, and the routine returns 'true'.
37968 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
37969 // If either operand is undef, bail out. The binop should be simplified.
37970 if (LHS.isUndef() || RHS.isUndef())
37973 // Look for the following pattern:
37974 // A = < float a0, float a1, float a2, float a3 >
37975 // B = < float b0, float b1, float b2, float b3 >
37977 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
37978 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
37979 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
37980 // which is A horizontal-op B.
37982 // At least one of the operands should be a vector shuffle.
37983 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
37984 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
37987 MVT VT = LHS.getSimpleValueType();
37988 assert((VT.is128BitVector() || VT.is256BitVector()) &&
37989 "Unsupported vector type for horizontal add/sub");
37991 // View LHS in the form
37992 // LHS = VECTOR_SHUFFLE A, B, LMask
37993 // If LHS is not a shuffle, then pretend it is the identity shuffle:
37994 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
37995 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
37996 unsigned NumElts = VT.getVectorNumElements();
37998 SmallVector<int, 16> LMask(NumElts);
37999 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
38000 if (!LHS.getOperand(0).isUndef())
38001 A = LHS.getOperand(0);
38002 if (!LHS.getOperand(1).isUndef())
38003 B = LHS.getOperand(1);
38004 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
38005 llvm::copy(Mask, LMask.begin());
38008 for (unsigned i = 0; i != NumElts; ++i)
38012 // Likewise, view RHS in the form
38013 // RHS = VECTOR_SHUFFLE C, D, RMask
38015 SmallVector<int, 16> RMask(NumElts);
38016 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
38017 if (!RHS.getOperand(0).isUndef())
38018 C = RHS.getOperand(0);
38019 if (!RHS.getOperand(1).isUndef())
38020 D = RHS.getOperand(1);
38021 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
38022 llvm::copy(Mask, RMask.begin());
38025 for (unsigned i = 0; i != NumElts; ++i)
38029 // If A and B occur in reverse order in RHS, then canonicalize by commuting
38030 // RHS operands and shuffle mask.
38033 ShuffleVectorSDNode::commuteMask(RMask);
38035 // Check that the shuffles are both shuffling the same vectors.
38036 if (!(A == C && B == D))
38039 // LHS and RHS are now:
38040 // LHS = shuffle A, B, LMask
38041 // RHS = shuffle A, B, RMask
38042 // Check that the masks correspond to performing a horizontal operation.
38043 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
38044 // so we just repeat the inner loop if this is a 256-bit op.
38045 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
38046 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
38047 assert((NumEltsPer128BitChunk % 2 == 0) &&
38048 "Vector type should have an even number of elements in each lane");
38049 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
38050 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
38051 // Ignore undefined components.
38052 int LIdx = LMask[i + j], RIdx = RMask[i + j];
38053 if (LIdx < 0 || RIdx < 0 ||
38054 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
38055 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
38058 // The low half of the 128-bit result must choose from A.
38059 // The high half of the 128-bit result must choose from B,
38060 // unless B is undef. In that case, we are always choosing from A.
38061 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
38062 unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
38064 // Check that successive elements are being operated on. If not, this is
38065 // not a horizontal operation.
38066 int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
38067 if (!(LIdx == Index && RIdx == Index + 1) &&
38068 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
38073 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
38074 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
38078 /// Do target-specific dag combines on floating-point adds/subs.
38079 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
38080 const X86Subtarget &Subtarget) {
38081 EVT VT = N->getValueType(0);
38082 SDValue LHS = N->getOperand(0);
38083 SDValue RHS = N->getOperand(1);
38084 bool IsFadd = N->getOpcode() == ISD::FADD;
38085 auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
38086 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
38088 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
38089 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
38090 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
38091 isHorizontalBinOp(LHS, RHS, IsFadd) &&
38092 shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
38093 return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
38098 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
38100 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
38101 /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
38102 /// anything that is guaranteed to be transformed by DAGCombiner.
38103 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
38104 const X86Subtarget &Subtarget,
38106 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
38107 SDValue Src = N->getOperand(0);
38108 unsigned Opcode = Src.getOpcode();
38109 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38111 EVT VT = N->getValueType(0);
38112 EVT SrcVT = Src.getValueType();
38114 auto IsFreeTruncation = [VT](SDValue Op) {
38115 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
38117 // See if this has been extended from a smaller/equal size to
38118 // the truncation size, allowing a truncation to combine with the extend.
38119 unsigned Opcode = Op.getOpcode();
38120 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
38121 Opcode == ISD::ZERO_EXTEND) &&
38122 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
38125 // See if this is a single use constant which can be constant folded.
38126 SDValue BC = peekThroughOneUseBitcasts(Op);
38127 return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
38130 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
38131 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
38132 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
38133 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
38136 // Don't combine if the operation has other uses.
38137 if (!Src.hasOneUse())
38140 // Only support vector truncation for now.
38141 // TODO: i64 scalar math would benefit as well.
38142 if (!VT.isVector())
38145 // In most cases it's only worth pre-truncating if we're only facing the cost
38146 // of one truncation.
38147 // i.e. if one of the inputs will constant fold or the input is repeated.
38152 SDValue Op0 = Src.getOperand(0);
38153 SDValue Op1 = Src.getOperand(1);
38154 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
38155 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
38156 return TruncateArithmetic(Op0, Op1);
38161 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
38162 // better to truncate if we have the chance.
38163 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
38164 !TLI.isOperationLegal(Opcode, SrcVT))
38165 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
38168 SDValue Op0 = Src.getOperand(0);
38169 SDValue Op1 = Src.getOperand(1);
38170 if (TLI.isOperationLegal(Opcode, VT) &&
38171 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
38172 return TruncateArithmetic(Op0, Op1);
38176 // TODO: For ISD::SUB we are conservative and require both sides to be freely
38177 // truncatable to avoid interfering with combineSubToSubus.
38178 SDValue Op0 = Src.getOperand(0);
38179 SDValue Op1 = Src.getOperand(1);
38180 if (TLI.isOperationLegal(Opcode, VT) &&
38181 (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
38182 return TruncateArithmetic(Op0, Op1);
38190 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
38191 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
38192 const X86Subtarget &Subtarget,
38193 SelectionDAG &DAG) {
38194 SDValue In = N->getOperand(0);
38195 EVT InVT = In.getValueType();
38196 EVT InSVT = InVT.getVectorElementType();
38197 EVT OutVT = N->getValueType(0);
38198 EVT OutSVT = OutVT.getVectorElementType();
38200 // Split a long vector into vectors of legal type and mask off the bits that
38201 // won't appear in the result, to prevent saturation.
38202 // TODO - we should be doing this at the maximum legal size, but this is
38203 // causing regressions where we're concatenating back to max width just to
38204 // perform the AND and then extracting back again...
38205 unsigned NumSubRegs = InVT.getSizeInBits() / 128;
38206 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
38207 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
38208 SmallVector<SDValue, 8> SubVecs(NumSubRegs);
38211 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
38212 SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
38214 for (unsigned i = 0; i < NumSubRegs; i++) {
38215 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
38216 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
38217 SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
38219 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
38221 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
38224 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
38225 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
38226 const X86Subtarget &Subtarget,
38227 SelectionDAG &DAG) {
38228 SDValue In = N->getOperand(0);
38229 EVT InVT = In.getValueType();
38230 EVT OutVT = N->getValueType(0);
38231 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
38232 DAG.getValueType(OutVT));
38233 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
38236 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
38237 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
38238 /// legalization the truncation will be translated into a BUILD_VECTOR with each
38239 /// element extracted from a vector and then truncated, and it is difficult to
38240 /// do this optimization on that form.
38241 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
38242 const X86Subtarget &Subtarget) {
38243 EVT OutVT = N->getValueType(0);
38244 if (!OutVT.isVector())
38247 SDValue In = N->getOperand(0);
38248 if (!In.getValueType().isSimple())
38251 EVT InVT = In.getValueType();
38252 unsigned NumElems = OutVT.getVectorNumElements();
38254 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
38255 // SSE2, and we need to take care of it specially.
38256 // AVX512 provides vpmovdb.
38257 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
38260 EVT OutSVT = OutVT.getVectorElementType();
38261 EVT InSVT = InVT.getVectorElementType();
38262 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
38263 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
38267 // SSSE3's pshufb results in fewer instructions in the cases below.
38268 if (Subtarget.hasSSSE3() && NumElems == 8 &&
38269 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
38270 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
38274 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
38275 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
38276 // truncate 2 x v4i32 to v8i16.
38277 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
38278 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
38279 if (InSVT == MVT::i32)
38280 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
38285 /// This function transforms vector truncation of 'extended sign-bits' or
38286 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
38287 /// X86ISD::PACKSS/PACKUS operations.
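/// For example, a v8i32 vector-compare result (every bit of each element is a
/// copy of the sign bit) can be truncated to v8i16 with a single packssdw,
/// since signed saturation cannot change 0 or -1 values.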
38288 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
38290 const X86Subtarget &Subtarget) {
38291 // Requires SSE2 but AVX512 has fast truncate.
38292 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
38295 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
38298 SDValue In = N->getOperand(0);
38299 if (!In.getValueType().isSimple())
38302 MVT VT = N->getValueType(0).getSimpleVT();
38303 MVT SVT = VT.getScalarType();
38305 MVT InVT = In.getValueType().getSimpleVT();
38306 MVT InSVT = InVT.getScalarType();
38308 // Check we have a truncation suited for PACKSS/PACKUS.
38309 if (!VT.is128BitVector() && !VT.is256BitVector())
38311 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
38313 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
38316 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
38317 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
38319 // Use PACKUS if the input has zero-bits that extend all the way to the
38320 // packed/truncated value. e.g. masks, zext_in_reg, etc.
38321 KnownBits Known = DAG.computeKnownBits(In);
38322 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
38323 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
38324 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
38326 // Use PACKSS if the input has sign-bits that extend all the way to the
38327 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
38328 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
38329 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
38330 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
38335 // Try to form a MULHU or MULHS node by looking for
38336 // (trunc (srl (mul ext, ext), 16))
38337 // TODO: This is X86 specific because we want to be able to handle wide types
38338 // before type legalization. But we can only do it if the vector will be
38339 // legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
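// A sketch of the matched pattern (assuming a vXi16 result and vXi32 input):
//   (v8i16 (trunc (v8i32 (srl (mul (sext v8i16 A), (sext v8i16 B)), 16))))
//     --> (v8i16 (mulhs A, B))
// and the equivalent zero-extended form maps to mulhu.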
38342 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
38343 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
38344 // First instruction should be a right shift of a multiply.
38345 if (Src.getOpcode() != ISD::SRL ||
38346 Src.getOperand(0).getOpcode() != ISD::MUL)
38349 if (!Subtarget.hasSSE2())
38352 // Only handle vXi16 types that are at least 128-bits unless they will be
38354 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
38355 (!ExperimentalVectorWideningLegalization &&
38356 VT.getVectorNumElements() < 8))
38359 // Input type should be vXi32.
38360 EVT InVT = Src.getValueType();
38361 if (InVT.getVectorElementType() != MVT::i32)
38364 // Need a shift by 16.
  APInt ShiftAmt;
  if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
      ShiftAmt != 16)
    return SDValue();
38370 SDValue LHS = Src.getOperand(0).getOperand(0);
38371 SDValue RHS = Src.getOperand(0).getOperand(1);
38373 unsigned ExtOpc = LHS.getOpcode();
38374 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
38375 RHS.getOpcode() != ExtOpc)
38378 // Peek through the extends.
38379 LHS = LHS.getOperand(0);
38380 RHS = RHS.getOperand(0);
38382 // Ensure the input types match.
38383 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
38386 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
38387 return DAG.getNode(Opc, DL, VT, LHS, RHS);
38390 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
38391 // from one vector with signed bytes from another vector, adds together
38392 // adjacent pairs of 16-bit products, and saturates the result before
38393 // truncating to 16-bits.
38395 // Which looks something like this:
38396 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
38397 // (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
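// When the whole pattern is recognized it is replaced by X86ISD::VPMADDUBSW,
// whose per-element behaviour is, as a sketch:
//   r[i] = ssat_i16(zext(A[2*i])   * sext(B[2*i]) +
//                   zext(A[2*i+1]) * sext(B[2*i+1]))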
38398 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget,
                               const SDLoc &DL) {
38401 if (!VT.isVector() || !Subtarget.hasSSSE3())
38404 unsigned NumElems = VT.getVectorNumElements();
38405 EVT ScalarVT = VT.getVectorElementType();
38406 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
38409 SDValue SSatVal = detectSSatPattern(In, VT);
38410 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
38413 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
38414 // of multiplies from even/odd elements.
38415 SDValue N0 = SSatVal.getOperand(0);
38416 SDValue N1 = SSatVal.getOperand(1);
38418 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
38421 SDValue N00 = N0.getOperand(0);
38422 SDValue N01 = N0.getOperand(1);
38423 SDValue N10 = N1.getOperand(0);
38424 SDValue N11 = N1.getOperand(1);
38426 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
38427 // Canonicalize zero_extend to LHS.
38428 if (N01.getOpcode() == ISD::ZERO_EXTEND)
38429 std::swap(N00, N01);
38430 if (N11.getOpcode() == ISD::ZERO_EXTEND)
38431 std::swap(N10, N11);
38433 // Ensure we have a zero_extend and a sign_extend.
38434 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
38435 N01.getOpcode() != ISD::SIGN_EXTEND ||
38436 N10.getOpcode() != ISD::ZERO_EXTEND ||
38437 N11.getOpcode() != ISD::SIGN_EXTEND)
38440 // Peek through the extends.
38441 N00 = N00.getOperand(0);
38442 N01 = N01.getOperand(0);
38443 N10 = N10.getOperand(0);
38444 N11 = N11.getOperand(0);
38446 // Ensure the extend is from vXi8.
38447 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
38448 N01.getValueType().getVectorElementType() != MVT::i8 ||
38449 N10.getValueType().getVectorElementType() != MVT::i8 ||
38450 N11.getValueType().getVectorElementType() != MVT::i8)
38453 // All inputs should be build_vectors.
38454 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
38455 N01.getOpcode() != ISD::BUILD_VECTOR ||
38456 N10.getOpcode() != ISD::BUILD_VECTOR ||
38457 N11.getOpcode() != ISD::BUILD_VECTOR)
38460 // N00/N10 are zero extended. N01/N11 are sign extended.
  // For each output element, we need the even element from one input vector
  // multiplied by the even element of the other, plus the product of the
  // following odd elements from those same two vectors. In other words, for
  // each element i this computation must be performed:
38467 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
38468 SDValue ZExtIn, SExtIn;
38469 for (unsigned i = 0; i != NumElems; ++i) {
38470 SDValue N00Elt = N00.getOperand(i);
38471 SDValue N01Elt = N01.getOperand(i);
38472 SDValue N10Elt = N10.getOperand(i);
38473 SDValue N11Elt = N11.getOperand(i);
38474 // TODO: Be more tolerant to undefs.
38475 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38476 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38477 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38478 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
38480 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
38481 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
38482 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
38483 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
38484 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
38486 unsigned IdxN00 = ConstN00Elt->getZExtValue();
38487 unsigned IdxN01 = ConstN01Elt->getZExtValue();
38488 unsigned IdxN10 = ConstN10Elt->getZExtValue();
38489 unsigned IdxN11 = ConstN11Elt->getZExtValue();
38490 // Add is commutative so indices can be reordered.
38491 if (IdxN00 > IdxN10) {
38492 std::swap(IdxN00, IdxN10);
38493 std::swap(IdxN01, IdxN11);
38495 // N0 indices be the even element. N1 indices must be the next odd element.
38496 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
38497 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
38499 SDValue N00In = N00Elt.getOperand(0);
38500 SDValue N01In = N01Elt.getOperand(0);
38501 SDValue N10In = N10Elt.getOperand(0);
38502 SDValue N11In = N11Elt.getOperand(0);
    // First time we find an input, capture it.
    if (!ZExtIn) {
      ZExtIn = N00In;
      SExtIn = N01In;
    }
    if (ZExtIn != N00In || SExtIn != N01In ||
        ZExtIn != N10In || SExtIn != N11In)
      return SDValue();
  }
38513 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38514 ArrayRef<SDValue> Ops) {
    // Shrink by adding truncate nodes and let DAGCombine fold with the
    // sources.
38517 EVT InVT = Ops[0].getValueType();
38518 assert(InVT.getScalarType() == MVT::i8 &&
38519 "Unexpected scalar element type");
38520 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
38521 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38522 InVT.getVectorNumElements() / 2);
    return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
                          PMADDBuilder);
}
38529 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
38530 const X86Subtarget &Subtarget) {
38531 EVT VT = N->getValueType(0);
38532 SDValue Src = N->getOperand(0);
38535 // Attempt to pre-truncate inputs to arithmetic ops instead.
38536 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
38539 // Try to detect AVG pattern first.
38540 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
38543 // Try to detect PMADD
38544 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
38547 // Try to combine truncation with signed/unsigned saturation.
38548 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
38551 // Try to combine PMULHUW/PMULHW for vXi16.
38552 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
38555 // The bitcast source is a direct mmx result.
  // Detect a truncation of a bitcast from x86mmx to i32.
38557 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
38558 SDValue BCSrc = Src.getOperand(0);
38559 if (BCSrc.getValueType() == MVT::x86mmx)
38560 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
38563 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
38564 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
38567 return combineVectorTruncation(N, DAG, Subtarget);
38570 /// Returns the negated value if the node \p N flips sign of FP value.
/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
/// or FSUB(-0.0, x).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
/// This also recognizes splat of a negated value and returns the splat of that
/// value.
38579 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
38580 if (N->getOpcode() == ISD::FNEG)
38581 return N->getOperand(0);
38583 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
38584 auto VT = Op->getValueType(0);
38585 if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
38586 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
38587 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
38588 if (!SVOp->getOperand(1).isUndef())
38590 if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
38591 return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
38595 unsigned Opc = Op.getOpcode();
38596 if (Opc == ISD::INSERT_VECTOR_ELT) {
    // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is
    // INSERT_VECTOR_ELT(UNDEF, -V, INDEX).
38599 SDValue InsVector = Op.getOperand(0);
38600 SDValue InsVal = Op.getOperand(1);
38601 if (!InsVector.isUndef())
38603 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
38604 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
38605 NegInsVal, Op.getOperand(2));
38609 if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
38612 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
38613 if (!Op1.getValueType().isFloatingPoint())
38616 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
38618 // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
38619 // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
38620 // masks and hence we swap the operands.
38621 if (Opc == ISD::FSUB)
38622 std::swap(Op0, Op1);
  APInt UndefElts;
  SmallVector<APInt, 16> EltBits;
  // Extract constant bits and see if they are all sign bit masks. Ignore the
  // undef elements.
38628 if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(),
38629 UndefElts, EltBits,
38630 /* AllowWholeUndefs */ true,
38631 /* AllowPartialUndefs */ false)) {
38632 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
      if (!UndefElts[I] && !EltBits[I].isSignMask())
        return SDValue();
38636 return peekThroughBitcasts(Op0);
38642 /// Do target-specific dag combines on floating point negations.
38643 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
38644 const X86Subtarget &Subtarget) {
38645 EVT OrigVT = N->getValueType(0);
38646 SDValue Arg = isFNEG(DAG, N);
38650 EVT VT = Arg.getValueType();
38651 EVT SVT = VT.getScalarType();
38654 // Let legalize expand this if it isn't a legal type yet.
38655 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
38658 // If we're negating a FMUL node on a target with FMA, then we can avoid the
38659 // use of a constant by performing (-0 - A*B) instead.
38660 // FIXME: Check rounding control flags as well once it becomes available.
38661 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
38662 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
38663 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
38664 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
38665 Arg.getOperand(1), Zero);
38666 return DAG.getBitcast(OrigVT, NewNode);
38669 // If we're negating an FMA node, then we can adjust the
38670 // instruction to include the extra negation.
38671 unsigned NewOpcode = 0;
38672 if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
38673 switch (Arg.getOpcode()) {
38674 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
38675 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
38676 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
38677 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
38678 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
38679 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
38680 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
38681 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
38682 // We can't handle scalar intrinsic node here because it would only
38683 // invert one element and not the whole vector. But we could try to handle
38684 // a negation of the lower element only.
38688 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
38689 Arg.getNode()->ops()));
38694 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
38695 const X86Subtarget &Subtarget) {
38696 MVT VT = N->getSimpleValueType(0);
38697 // If we have integer vector types available, use the integer opcodes.
38698 if (!VT.isVector() || !Subtarget.hasSSE2())
38703 unsigned IntBits = VT.getScalarSizeInBits();
38704 MVT IntSVT = MVT::getIntegerVT(IntBits);
38705 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
38707 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
38708 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
38709 unsigned IntOpcode;
38710 switch (N->getOpcode()) {
38711 default: llvm_unreachable("Unexpected FP logic op");
38712 case X86ISD::FOR: IntOpcode = ISD::OR; break;
38713 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
38714 case X86ISD::FAND: IntOpcode = ISD::AND; break;
38715 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
38717 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
38718 return DAG.getBitcast(VT, IntOp);
38722 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
38723 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
38724 if (N->getOpcode() != ISD::XOR)
38727 SDValue LHS = N->getOperand(0);
38728 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
38729 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
38732 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
38733 X86::CondCode(LHS->getConstantOperandVal(0)));
38735 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
38738 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
38739 TargetLowering::DAGCombinerInfo &DCI,
38740 const X86Subtarget &Subtarget) {
38741 // If this is SSE1 only convert to FXOR to avoid scalarization.
38742 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
38743 N->getValueType(0) == MVT::v4i32) {
38744 return DAG.getBitcast(
38745 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
38746 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
38747 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
38750 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
38753 if (DCI.isBeforeLegalizeOps())
38756 if (SDValue SetCC = foldXor1SetCC(N, DAG))
38759 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
38762 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
38765 return combineFneg(N, DAG, Subtarget);
38768 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
38769 TargetLowering::DAGCombinerInfo &DCI,
38770 const X86Subtarget &Subtarget) {
38771 SDValue Op0 = N->getOperand(0);
38772 SDValue Op1 = N->getOperand(1);
38773 EVT VT = N->getValueType(0);
38774 unsigned NumBits = VT.getSizeInBits();
38776 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38778 // TODO - Constant Folding.
38779 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38780 // Reduce Cst1 to the bottom 16-bits.
38781 // NOTE: SimplifyDemandedBits won't do this for constants.
38782 const APInt &Val1 = Cst1->getAPIntValue();
38783 APInt MaskedVal1 = Val1 & 0xFFFF;
38784 if (MaskedVal1 != Val1)
38785 return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
38786 DAG.getConstant(MaskedVal1, SDLoc(N), VT));
38789 // Only bottom 16-bits of the control bits are required.
38790 APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
38791 if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI))
38792 return SDValue(N, 0);
38797 static bool isNullFPScalarOrVectorConst(SDValue V) {
38798 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
38801 /// If a value is a scalar FP zero or a vector FP zero (potentially including
38802 /// undefined elements), return a zero constant that may be used to fold away
38803 /// that value. In the case of a vector, the returned constant will not contain
38804 /// undefined elements even if the input parameter does. This makes it suitable
38805 /// to be used as a replacement operand with operations (eg, bitwise-and) where
38806 /// an undef should not propagate.
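/// For example (illustrative): FAND(<0.0, undef, 0.0, undef>, X) can be folded
/// to a fully-defined all-zeros vector rather than one that still carries the
/// undef lanes of the original operand.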
38807 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
38808 const X86Subtarget &Subtarget) {
38809 if (!isNullFPScalarOrVectorConst(V))
38812 if (V.getValueType().isVector())
38813 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
38818 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
38819 const X86Subtarget &Subtarget) {
38820 SDValue N0 = N->getOperand(0);
38821 SDValue N1 = N->getOperand(1);
38822 EVT VT = N->getValueType(0);
38825 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
38826 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
38827 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
38828 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
38831 auto isAllOnesConstantFP = [](SDValue V) {
38832 if (V.getSimpleValueType().isVector())
38833 return ISD::isBuildVectorAllOnes(V.getNode());
38834 auto *C = dyn_cast<ConstantFPSDNode>(V);
38835 return C && C->getConstantFPValue()->isAllOnesValue();
38838 // fand (fxor X, -1), Y --> fandn X, Y
38839 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
38840 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
38842 // fand X, (fxor Y, -1) --> fandn Y, X
38843 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
38844 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
38849 /// Do target-specific dag combines on X86ISD::FAND nodes.
38850 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
38851 const X86Subtarget &Subtarget) {
38852 // FAND(0.0, x) -> 0.0
38853 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
38856 // FAND(x, 0.0) -> 0.0
38857 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
38860 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
38863 return lowerX86FPLogicOp(N, DAG, Subtarget);
38866 /// Do target-specific dag combines on X86ISD::FANDN nodes.
38867 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
38868 const X86Subtarget &Subtarget) {
38869 // FANDN(0.0, x) -> x
38870 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
38871 return N->getOperand(1);
38873 // FANDN(x, 0.0) -> 0.0
38874 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
38877 return lowerX86FPLogicOp(N, DAG, Subtarget);
38880 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
38881 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
38882 const X86Subtarget &Subtarget) {
38883 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
38885 // F[X]OR(0.0, x) -> x
38886 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
38887 return N->getOperand(1);
38889 // F[X]OR(x, 0.0) -> x
38890 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
38891 return N->getOperand(0);
38893 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
38896 return lowerX86FPLogicOp(N, DAG, Subtarget);
38899 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
38900 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
38901 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
38903 // Only perform optimizations if UnsafeMath is used.
38904 if (!DAG.getTarget().Options.UnsafeFPMath)
38907 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
38908 // into FMINC and FMAXC, which are Commutative operations.
38909 unsigned NewOp = 0;
38910 switch (N->getOpcode()) {
38911 default: llvm_unreachable("unknown opcode");
38912 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
38913 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
38916 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
38917 N->getOperand(0), N->getOperand(1));
38920 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
38921 const X86Subtarget &Subtarget) {
38922 if (Subtarget.useSoftFloat())
38925 // TODO: If an operand is already known to be a NaN or not a NaN, this
38926 // should be an optional swap and FMAX/FMIN.
38928 EVT VT = N->getValueType(0);
38929 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
38930 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
38931 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
38934 SDValue Op0 = N->getOperand(0);
38935 SDValue Op1 = N->getOperand(1);
38937 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
38939 // If we don't have to respect NaN inputs, this is a direct translation to x86
38940 // min/max instructions.
38941 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
38942 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
38944 // If we have to respect NaN inputs, this takes at least 3 instructions.
38945 // Favor a library call when operating on a scalar and minimizing code size.
38946 if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
38949 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
38950 DAG.getDataLayout(), *DAG.getContext(), VT);
  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //     Num    |  Max  |  Op0 |
  // Op0        ----------------
  //     NaN    |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // for what is conventionally described as:
  //     Min = Op1 < Op0 ? Op1 : Op0
  //     Max = Op1 > Op0 ? Op1 : Op0
38967 // So they always return Op0 if either input is a NaN. However, we can still
38968 // use those instructions for fmaxnum by selecting away a NaN input.
38970 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
38971 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
38972 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
38974 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
38975 // are NaN, the NaN value of Op1 is the result.
38976 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
38979 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
38980 TargetLowering::DAGCombinerInfo &DCI) {
38981 EVT VT = N->getValueType(0);
38982 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38984 APInt KnownUndef, KnownZero;
38985 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
                                     KnownZero, DCI))
38988 return SDValue(N, 0);
38993 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
38994 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
38995 TargetLowering::DAGCombinerInfo &DCI,
38996 const X86Subtarget &Subtarget) {
38997 MVT VT = N->getSimpleValueType(0);
38999 // ANDNP(0, x) -> x
39000 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
39001 return N->getOperand(1);
39003 // ANDNP(x, 0) -> 0
39004 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
39005 return DAG.getConstant(0, SDLoc(N), VT);
39007 // Turn ANDNP back to AND if input is inverted.
39008 if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
39009 ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) {
39010 return DAG.getNode(ISD::AND, SDLoc(N), VT,
39011 N->getOperand(0).getOperand(0), N->getOperand(1));
39014 // Attempt to recursively combine a bitmask ANDNP with shuffles.
39015 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
39017 if (SDValue Res = combineX86ShufflesRecursively(
39018 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
39019 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
39026 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
39027 TargetLowering::DAGCombinerInfo &DCI) {
39028 SDValue N0 = N->getOperand(0);
39029 SDValue N1 = N->getOperand(1);
39031 // BT ignores high bits in the bit index operand.
39032 unsigned BitWidth = N1.getValueSizeInBits();
39033 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
39034 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
39035 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
39040 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
39041 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
39042 EVT VT = N->getValueType(0);
39044 SDValue N0 = N->getOperand(0);
39045 SDValue N1 = N->getOperand(1);
39046 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
39048 if (ExtraVT != MVT::i16)
39051 // Look through single use any_extends.
39052 if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
39053 N0 = N0.getOperand(0);
39055 // See if we have a single use cmov.
39056 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
39059 SDValue CMovOp0 = N0.getOperand(0);
39060 SDValue CMovOp1 = N0.getOperand(1);
39062 // Make sure both operands are constants.
39063 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
39064 !isa<ConstantSDNode>(CMovOp1.getNode()))
  // If we looked through an any_extend above, apply that extend to the
  // constants as well.
39070 if (N0.getValueType() != VT) {
39071 CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
39072 CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
39075 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
39076 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
39078 return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
39079 N0.getOperand(2), N0.getOperand(3));
39082 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
39083 const X86Subtarget &Subtarget) {
39084 if (SDValue V = combineSextInRegCmov(N, DAG))
39087 EVT VT = N->getValueType(0);
39088 SDValue N0 = N->getOperand(0);
39089 SDValue N1 = N->getOperand(1);
39090 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
  // since there is no sign-extended shift right operation on a vector with
  // 64-bit elements.
39096 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
39097 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
39098 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
39099 N0.getOpcode() == ISD::SIGN_EXTEND)) {
39100 SDValue N00 = N0.getOperand(0);
    // EXTLOAD has a better solution on AVX2: it may be replaced with an
    // X86ISD::VSEXT node.
39104 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
39105 if (!ISD::isNormalLoad(N00.getNode()))
39108 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
39109 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
39111 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
39117 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
39118 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
39119 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
39120 /// opportunities to combine math ops, use an LEA, or use a complex addressing
39121 /// mode. This can eliminate extend, add, and shift instructions.
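/// For example (illustrative): if (i64 (sext (i32 add nsw X, 40))) feeds an
/// address computation, rewriting it as (i64 add (i64 sext X), 40) lets the
/// constant become an LEA/addressing-mode displacement instead of requiring a
/// separate extend of the add result.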
39122 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
39123 const X86Subtarget &Subtarget) {
39124 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
39125 Ext->getOpcode() != ISD::ZERO_EXTEND)
39128 // TODO: This should be valid for other integer types.
39129 EVT VT = Ext->getValueType(0);
39130 if (VT != MVT::i64)
39133 SDValue Add = Ext->getOperand(0);
39134 if (Add.getOpcode() != ISD::ADD)
39137 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
39138 bool NSW = Add->getFlags().hasNoSignedWrap();
39139 bool NUW = Add->getFlags().hasNoUnsignedWrap();
  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext'.
39143 if ((Sext && !NSW) || (!Sext && !NUW))
39146 // Having a constant operand to the 'add' ensures that we are not increasing
39147 // the instruction count because the constant is extended for free below.
39148 // A constant operand can also become the displacement field of an LEA.
39149 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
39153 // Don't make the 'add' bigger if there's no hope of combining it with some
39154 // other 'add' or 'shl' instruction.
39155 // TODO: It may be profitable to generate simpler LEA instructions in place
39156 // of single 'add' instructions, but the cost model for selecting an LEA
39157 // currently has a high threshold.
39158 bool HasLEAPotential = false;
39159 for (auto *User : Ext->uses()) {
39160 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
39161 HasLEAPotential = true;
39165 if (!HasLEAPotential)
39168 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
39169 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
39170 SDValue AddOp0 = Add.getOperand(0);
39171 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
39172 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
  // The wider add is guaranteed to not wrap because both operands are
  // sign-extended or zero-extended.
  SDNodeFlags Flags;
39177 Flags.setNoSignedWrap(NSW);
39178 Flags.setNoUnsignedWrap(NUW);
39179 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
39182 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
39183 // operands and the result of CMOV is not used anywhere else - promote CMOV
39184 // itself instead of promoting its result. This could be beneficial, because:
39185 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
39186 // (or more) pseudo-CMOVs only when they go one-after-another and
39187 // getting rid of result extension code after CMOV will help that.
39188 // 2) Promotion of constant CMOV arguments is free, hence the
39189 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
39190 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
39191 // promotion is also good in terms of code-size.
// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
// promotion.)
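// A sketch of the transform (constant operands only, illustrative):
//   (i32 (zext (i16 (X86ISD::CMOV C0, C1, cc, flags))))
//     --> (i32 (X86ISD::CMOV C0', C1', cc, flags))
// where C0'/C1' are the promoted constants and the extend disappears.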
39194 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
39195 SDValue CMovN = Extend->getOperand(0);
39196 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
39199 EVT TargetVT = Extend->getValueType(0);
39200 unsigned ExtendOpcode = Extend->getOpcode();
39203 EVT VT = CMovN.getValueType();
39204 SDValue CMovOp0 = CMovN.getOperand(0);
39205 SDValue CMovOp1 = CMovN.getOperand(1);
39207 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
39208 !isa<ConstantSDNode>(CMovOp1.getNode()))
39211 // Only extend to i32 or i64.
39212 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
  // are free.
39217 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
39220 // If this a zero extend to i64, we should only extend to i32 and use a free
39221 // zero extend to finish.
39222 EVT ExtendVT = TargetVT;
39223 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
39224 ExtendVT = MVT::i32;
39226 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
39227 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
39229 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
39230 CMovN.getOperand(2), CMovN.getOperand(3));
39232 // Finish extending if needed.
39233 if (ExtendVT != TargetVT)
39234 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
39239 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
39240 // This is more or less the reverse of combineBitcastvxi1.
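// For example (a rough sketch for a 16-bit mask on SSE2):
//   (v16i8 (zext (v16i1 (bitcast (i16 X)))))
// is rebuilt by broadcasting X to every lane, AND'ing each lane with the bit
// it corresponds to, comparing against that same bit mask, and finally
// shifting (for zext) or keeping (for sext) the compare result.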
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
39243 TargetLowering::DAGCombinerInfo &DCI,
39244 const X86Subtarget &Subtarget) {
39245 unsigned Opcode = N->getOpcode();
39246 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
39247 Opcode != ISD::ANY_EXTEND)
39249 if (!DCI.isBeforeLegalizeOps())
39251 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
39254 SDValue N0 = N->getOperand(0);
39255 EVT VT = N->getValueType(0);
39256 EVT SVT = VT.getScalarType();
39257 EVT InSVT = N0.getValueType().getScalarType();
39258 unsigned EltSizeInBits = SVT.getSizeInBits();
39260 // Input type must be extending a bool vector (bit-casted from a scalar
39261 // integer) to legal integer types.
39262 if (!VT.isVector())
39264 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
39266 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
39269 SDValue N00 = N0.getOperand(0);
39270 EVT SclVT = N0.getOperand(0).getValueType();
39271 if (!SclVT.isScalarInteger())
39276 SmallVector<int, 32> ShuffleMask;
39277 unsigned NumElts = VT.getVectorNumElements();
39278 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
39280 // Broadcast the scalar integer to the vector elements.
39281 if (NumElts > EltSizeInBits) {
39282 // If the scalar integer is greater than the vector element size, then we
39283 // must split it down into sub-sections for broadcasting. For example:
39284 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
39285 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
39286 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
39287 unsigned Scale = NumElts / EltSizeInBits;
    EVT BroadcastVT =
        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
39290 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
39291 Vec = DAG.getBitcast(VT, Vec);
39293 for (unsigned i = 0; i != Scale; ++i)
39294 ShuffleMask.append(EltSizeInBits, i);
  } else {
    // For smaller scalar integers, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to
    // all elements.
39299 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
39300 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
    ShuffleMask.append(NumElts, 0);
  }
39303 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
39305 // Now, mask the relevant bit in each element.
39306 SmallVector<SDValue, 32> Bits;
39307 for (unsigned i = 0; i != NumElts; ++i) {
39308 int BitIdx = (i % EltSizeInBits);
39309 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
39310 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
39312 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
39313 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
39315 // Compare against the bitmask and extend the result.
39316 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
39317 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
39318 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
  // For SEXT, this is now done; otherwise shift the result down for
  // zero extension.
  if (Opcode == ISD::SIGN_EXTEND)
    return Vec;
39324 return DAG.getNode(ISD::SRL, DL, VT, Vec,
39325 DAG.getConstant(EltSizeInBits - 1, DL, VT));
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating with
/// UNDEFs) the input into vectors of the same size as the target type, which
/// then extend the lowest elements.
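/// For example (illustrative): on SSE targets
///   (v8i16 (sext (v8i8 X)))
/// becomes
///   (v8i16 (sign_extend_vector_inreg (v16i8 (concat_vectors X, undef))))
/// i.e. the input is widened with UNDEFs to the result width and only the low
/// elements are actually extended.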
39332 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
39333 TargetLowering::DAGCombinerInfo &DCI,
39334 const X86Subtarget &Subtarget) {
39335 if (ExperimentalVectorWideningLegalization)
39338 unsigned Opcode = N->getOpcode();
39339 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
39341 if (!DCI.isBeforeLegalizeOps())
39343 if (!Subtarget.hasSSE2())
39346 SDValue N0 = N->getOperand(0);
39347 EVT VT = N->getValueType(0);
39348 EVT SVT = VT.getScalarType();
39349 EVT InVT = N0.getValueType();
39350 EVT InSVT = InVT.getScalarType();
  // FIXME: Generic DAGCombiner previously had a bug that would cause a
  // sign_extend of setcc to sometimes return the original node and trick it
  // into thinking CombineTo was used, which prevented the target combines from
  // running.
  // Returning early here to avoid regressions like this
39357 // (v4i32 (sext (v4i1 (setcc (v4i16)))))
39359 // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef))))
39360 // Type legalized to
39361 // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32)))))))
39362 // Leading to a packssdw+pmovsxwd
39363 // We could write a DAG combine to fix this, but really we shouldn't be
39364 // creating sext_invec that's forcing v8i16 into the DAG.
39365 if (N0.getOpcode() == ISD::SETCC)
39368 // Input type must be a vector and we must be extending legal integer types.
39369 if (!VT.isVector() || VT.getVectorNumElements() < 2)
39371 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
39373 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
39376 // If the input/output types are both legal then we have at least AVX1 and
39377 // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
39378 if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
39379 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
39384 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
39385 EVT InVT = N.getValueType();
39386 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
39387 Size / InVT.getScalarSizeInBits());
39388 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
39391 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
39394 // If target-size is less than 128-bits, extend to a type that would extend
39395 // to 128 bits, extend that and extract the original target vector.
39396 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
39397 unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
39400 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
39401 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
39402 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
39403 DAG.getIntPtrConstant(0, DL));
39406 // If target-size is 128-bits (or 256-bits on AVX target), then convert to
39407 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
39408 // Also use this if we don't have SSE41 to allow the legalizer do its job.
39409 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
39410 (VT.is256BitVector() && Subtarget.hasAVX()) ||
39411 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
39412 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
39413 Opcode = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
39414 : ISD::ZERO_EXTEND_VECTOR_INREG;
39415 return DAG.getNode(Opcode, DL, VT, ExOp);
39418 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
39419 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
39420 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
39421 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
39422 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
39424 unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
39425 : ISD::ZERO_EXTEND_VECTOR_INREG;
39427 SmallVector<SDValue, 8> Opnds;
39428 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
39429 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
39430 DAG.getIntPtrConstant(Offset, DL));
39431 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
39432 SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
39433 Opnds.push_back(SrcVec);
39435 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
39438 // On pre-AVX targets, split into 128-bit nodes of
39439 // ISD::*_EXTEND_VECTOR_INREG.
39440 if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
39441 return SplitAndExtendInReg(128);
39443 // On pre-AVX512 targets, split into 256-bit nodes of
39444 // ISD::*_EXTEND_VECTOR_INREG.
39445 if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
39446 return SplitAndExtendInReg(256);
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
39453 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
39454 const X86Subtarget &Subtarget) {
39455 SDValue N0 = N->getOperand(0);
39456 EVT VT = N->getValueType(0);
39459 // Only do this combine with AVX512 for vector extends.
39460 if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
39463 // Only combine legal element types.
39464 EVT SVT = VT.getVectorElementType();
39465 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
39466 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
  // We can only do this if the vector size is 256 bits or less.
39470 unsigned Size = VT.getSizeInBits();
39474 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
  // those are the only integer compares we have.
39476 ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
39477 if (ISD::isUnsignedIntSetCC(CC))
39480 // Only do this combine if the extension will be fully consumed by the setcc.
39481 EVT N00VT = N0.getOperand(0).getValueType();
39482 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
39483 if (Size != MatchingVecType.getSizeInBits())
39486 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
39488 if (N->getOpcode() == ISD::ZERO_EXTEND)
39489 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
39494 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
39495 TargetLowering::DAGCombinerInfo &DCI,
39496 const X86Subtarget &Subtarget) {
39497 SDValue N0 = N->getOperand(0);
39498 EVT VT = N->getValueType(0);
39499 EVT InVT = N0.getValueType();
39502 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
39505 if (!DCI.isBeforeLegalizeOps())
39508 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
39511 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
39512 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Inverting and sign-extending a boolean is the same as zero-extending and
    // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
    // efficiently lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
39516 // sext (xor Bool, -1) --> sub (zext Bool), 1
39517 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
39518 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
39521 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
39524 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
39528 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
39531 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
  if (NegMul) {
    switch (Opcode) {
39540 default: llvm_unreachable("Unexpected opcode");
39541 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
39542 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
39543 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
39544 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
39545 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
39546 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
39547 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
39548 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
    }
  }

  if (NegAcc) {
    switch (Opcode) {
    default: llvm_unreachable("Unexpected opcode");
39555 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
39556 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
39557 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
39558 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
39559 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
39560 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
39561 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
    case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
    }
  }

  return Opcode;
}
39569 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
39570 const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
39574 // Let legalize expand this if it isn't a legal type yet.
39575 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
39578 EVT ScalarVT = VT.getScalarType();
39579 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
39582 SDValue A = N->getOperand(0);
39583 SDValue B = N->getOperand(1);
39584 SDValue C = N->getOperand(2);
39586 auto invertIfNegative = [&DAG](SDValue &V) {
39587 if (SDValue NegVal = isFNEG(DAG, V.getNode())) {
39588 V = DAG.getBitcast(V.getValueType(), NegVal);
39591 // Look through extract_vector_elts. If it comes from an FNEG, create a
39592 // new extract from the FNEG input.
39593 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39594 isNullConstant(V.getOperand(1))) {
39595 if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) {
39596 NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
39597 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
39598 NegVal, V.getOperand(1));
39606 // Do not convert the passthru input of scalar intrinsics.
39607 // FIXME: We could allow negations of the lower element only.
39608 bool NegA = invertIfNegative(A);
39609 bool NegB = invertIfNegative(B);
39610 bool NegC = invertIfNegative(C);
39612 if (!NegA && !NegB && !NegC)
39615 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
39617 if (N->getNumOperands() == 4)
39618 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
39619 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
39622 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
39623 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
39624 const X86Subtarget &Subtarget) {
39626 EVT VT = N->getValueType(0);
39628 SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode());
39632 unsigned NewOpcode;
39633 switch (N->getOpcode()) {
39634 default: llvm_unreachable("Unexpected opcode!");
39635 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
39636 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
39637 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
39638 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
39641 if (N->getNumOperands() == 4)
39642 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
39643 NegVal, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                     NegVal);
}
39648 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
39649 TargetLowering::DAGCombinerInfo &DCI,
39650 const X86Subtarget &Subtarget) {
39651 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
39652 // (and (i32 x86isd::setcc_carry), 1)
39653 // This eliminates the zext. This transformation is necessary because
39654 // ISD::SETCC is always legalized to i8.
39656 SDValue N0 = N->getOperand(0);
39657 EVT VT = N->getValueType(0);
39659 if (N0.getOpcode() == ISD::AND &&
39661 N0.getOperand(0).hasOneUse()) {
39662 SDValue N00 = N0.getOperand(0);
39663 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
39664 if (!isOneConstant(N0.getOperand(1)))
39666 return DAG.getNode(ISD::AND, dl, VT,
39667 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
39668 N00.getOperand(0), N00.getOperand(1)),
39669 DAG.getConstant(1, dl, VT));
39673 if (N0.getOpcode() == ISD::TRUNCATE &&
39675 N0.getOperand(0).hasOneUse()) {
39676 SDValue N00 = N0.getOperand(0);
39677 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
39678 return DAG.getNode(ISD::AND, dl, VT,
39679 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
39680 N00.getOperand(0), N00.getOperand(1)),
39681 DAG.getConstant(1, dl, VT));
39685 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
39688 if (DCI.isBeforeLegalizeOps())
39689 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
39692 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
39695 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
39699 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
39702 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
39705 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
39711 /// Try to map a 128-bit or larger integer comparison to vector instructions
39712 /// before type legalization splits it up into chunks.
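/// For example (illustrative): an i128 equality compare coming out of memcmp
/// expansion can be lowered as
///   (setcc (movmsk (pcmpeqb (bitcast X), (bitcast Y))), 0xFFFF, eq)
/// instead of being split into two i64 compares by type legalization.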
39713 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
39714 const X86Subtarget &Subtarget) {
39715 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
39716 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
39718 // We're looking for an oversized integer equality comparison.
39719 SDValue X = SetCC->getOperand(0);
39720 SDValue Y = SetCC->getOperand(1);
39721 EVT OpVT = X.getValueType();
39722 unsigned OpSize = OpVT.getSizeInBits();
39723 if (!OpVT.isScalarInteger() || OpSize < 128)
39726 // Ignore a comparison with zero because that gets special treatment in
39727 // EmitTest(). But make an exception for the special case of a pair of
39728 // logically-combined vector-sized operands compared to zero. This pattern may
39729 // be generated by the memcmp expansion pass with oversized integer compares
39731 bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
39732 X.getOperand(0).getOpcode() == ISD::XOR &&
39733 X.getOperand(1).getOpcode() == ISD::XOR;
39734 if (isNullConstant(Y) && !IsOrXorXorCCZero)
39737 // Bail out if we know that this is not really just an oversized integer.
39738 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
39739 peekThroughBitcasts(Y).getValueType() == MVT::f128)
39742 // TODO: Use PXOR + PTEST for SSE4.1 or later?
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
39745 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
39746 (OpSize == 256 && Subtarget.hasAVX2()) ||
39747 (OpSize == 512 && Subtarget.useAVX512Regs())) {
39748 EVT VecVT = OpSize == 512 ? MVT::v16i32 :
                OpSize == 256 ? MVT::v32i8 :
                                MVT::v16i8;
39751 EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT;
39753 if (IsOrXorXorCCZero) {
39754 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
39755 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
      // Use 2 vector equality compares and 'and' the results before doing a
      // MOVMSK.
39758 SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
39759 SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
39760 SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
39761 SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
39762 SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
39763 SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
39764 Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
39766 SDValue VecX = DAG.getBitcast(VecVT, X);
39767 SDValue VecY = DAG.getBitcast(VecVT, Y);
39768 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
39770 // For 512-bits we want to emit a setcc that will lower to kortest.
    if (OpSize == 512)
      return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp),
39773 DAG.getConstant(0xFFFF, DL, MVT::i16), CC);
39774 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
39775 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
39776 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
39777 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
39778 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
39779 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
39780 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
39782 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
39788 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
39789 const X86Subtarget &Subtarget) {
39790 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
39791 SDValue LHS = N->getOperand(0);
39792 SDValue RHS = N->getOperand(1);
39793 EVT VT = N->getValueType(0);
39794 EVT OpVT = LHS.getValueType();
39797 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
39798 // 0-x == y --> x+y == 0
39799 // 0-x != y --> x+y != 0
39800 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
39802 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
39803 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
39805 // x == 0-y --> x+y == 0
39806 // x != 0-y --> x+y != 0
39807 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
39809 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
39810 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
39813 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
39817 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
39818 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
39819 // Put build_vectors on the right.
39820 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
39821 std::swap(LHS, RHS);
39822 CC = ISD::getSetCCSwappedOperands(CC);
    }

    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
39827 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
39828 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
39830 if (IsSEXT0 && IsVZero1) {
39831 assert(VT == LHS.getOperand(0).getValueType() &&
39832 "Uexpected operand type");
39833 if (CC == ISD::SETGT)
39834 return DAG.getConstant(0, DL, VT);
39835 if (CC == ISD::SETLE)
39836 return DAG.getConstant(1, DL, VT);
39837 if (CC == ISD::SETEQ || CC == ISD::SETGE)
39838 return DAG.getNOT(DL, LHS.getOperand(0), VT);
39840 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
39841 "Unexpected condition code!");
39842 return LHS.getOperand(0);
39846 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
39847 // pre-promote its result type since vXi1 vectors don't get promoted
39848 // during type legalization.
39849 // NOTE: The element count check is to ignore operand types that need to
39850 // go through type promotion to a 128-bit vector.
39851 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
39852 VT.getVectorElementType() == MVT::i1 &&
39853 (ExperimentalVectorWideningLegalization ||
39854 VT.getVectorNumElements() > 4) &&
39855 (OpVT.getVectorElementType() == MVT::i8 ||
39856 OpVT.getVectorElementType() == MVT::i16)) {
    SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
                                N->getOperand(2));
39859 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
39862 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
39863 // to avoid scalarization via legalization because v4i32 is not a legal type.
39864 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
39865 LHS.getValueType() == MVT::v4f32)
39866 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
39871 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
39872 TargetLowering::DAGCombinerInfo &DCI) {
39873 SDValue Src = N->getOperand(0);
39874 MVT SrcVT = Src.getSimpleValueType();
39875 MVT VT = N->getSimpleValueType(0);
39877 // Perform constant folding.
39878 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
    assert(VT == MVT::i32 && "Unexpected result type");
    APInt Imm(32, 0);
39881 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
39882 SDValue In = Src.getOperand(Idx);
39883 if (!In.isUndef() &&
39884 cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
39887 return DAG.getConstant(Imm, SDLoc(N), VT);
39890 // Look through int->fp bitcasts that don't change the element width.
39891 if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse() &&
39892 SrcVT.isFloatingPoint() &&
39893 Src.getOperand(0).getValueType() ==
39894 EVT(SrcVT).changeVectorElementTypeToInteger())
39895 Src = Src.getOperand(0);
39897 // Simplify the inputs.
39898 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39899 APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
39900 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
39901 return SDValue(N, 0);
39903 // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (shl X, BitWidth - 1 - C)).
39904 // Only do this when the setcc input and output types are the same and the
39905 // setcc and the 'and' node have a single use.
39906 // FIXME: Support 256-bits with AVX1. The movmsk is split, but the and isn't.
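// Illustrative example with v4i32 and C == 2 (so the 'and' mask is a splat of 4):
//   (movmsk (setne (and X, <4,4,4,4>), 0)) --> (movmsk (shl X, 29))
// i.e. bit 2 of each element is moved into the sign bit that movmsk reads.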
39908 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
39909 Src.getOperand(0).getValueType() == Src.getValueType() &&
39910 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETNE &&
39911 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
39912 Src.getOperand(0).getOpcode() == ISD::AND) {
39913 SDValue And = Src.getOperand(0);
39914 if (And.hasOneUse() &&
39915 ISD::isConstantSplatVector(And.getOperand(1).getNode(), SplatVal) &&
39916 SplatVal.isPowerOf2()) {
39917 MVT VT = Src.getSimpleValueType();
39918 unsigned BitWidth = VT.getScalarSizeInBits();
39919 unsigned ShAmt = BitWidth - SplatVal.logBase2() - 1;
39921 SDValue X = And.getOperand(0);
39922 // If the element type is i8, we need to bitcast to i16 to use a legal
39923 // shift. If we wait until lowering, we end up with an extra AND to keep bits
39924 // from crossing the 8-bit element boundaries, but we don't care about that here.
39925 if (VT.getVectorElementType() == MVT::i8) {
39926 VT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
39927 X = DAG.getBitcast(VT, X);
39929 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
39930 DAG.getConstant(ShAmt, DL, VT));
39931 SDValue Cast = DAG.getBitcast(SrcVT, Shl);
39932 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), N->getValueType(0), Cast);
39939 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
39940 TargetLowering::DAGCombinerInfo &DCI,
39941 const X86Subtarget &Subtarget) {
39944 if (DCI.isBeforeLegalizeOps()) {
39945 SDValue Index = N->getOperand(4);
39946 // Remove any sign extends from 32 or smaller to larger than 32.
39947 // Only do this before LegalizeOps in case we need the sign extend for legalization.
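// e.g. a (sext v4i32 Idx to v4i64) index can simply use the original v4i32 Idx,
// since gather/scatter instructions accept 32-bit indices directly.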
39949 if (Index.getOpcode() == ISD::SIGN_EXTEND) {
39950 if (Index.getScalarValueSizeInBits() > 32 &&
39951 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
39952 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
39953 NewOps[4] = Index.getOperand(0);
39954 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
39956 // The original sign extend now has fewer users; add it back to the worklist
39957 // in case it needs to be removed.
39958 DCI.AddToWorklist(Index.getNode());
39959 DCI.AddToWorklist(N);
39961 return SDValue(Res, 0);
39965 // Make sure the index is either i32 or i64
39966 unsigned ScalarSize = Index.getScalarValueSizeInBits();
39967 if (ScalarSize != 32 && ScalarSize != 64) {
39968 MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
39969 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
39970 Index.getValueType().getVectorNumElements());
39971 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
39972 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
39974 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
39976 DCI.AddToWorklist(N);
39977 return SDValue(Res, 0);
39980 // Try to remove zero extends from 32->64 if we know the sign bit of
39981 // the input is zero.
39982 if (Index.getOpcode() == ISD::ZERO_EXTEND &&
39983 Index.getScalarValueSizeInBits() == 64 &&
39984 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
39985 if (DAG.SignBitIsZero(Index.getOperand(0))) {
39986 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
39987 NewOps[4] = Index.getOperand(0);
39988 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
39990 // The original zero extend now has fewer users; add it back to the worklist
39991 // in case it needs to be removed.
39992 DCI.AddToWorklist(Index.getNode());
39993 DCI.AddToWorklist(N);
39995 return SDValue(Res, 0);
40001 // Without AVX512, only the sign bit of each mask element is used, so demand just the upper bit of the mask.
40001 if (!Subtarget.hasAVX512()) {
40002 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40003 SDValue Mask = N->getOperand(2);
40004 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
40005 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
40006 return SDValue(N, 0);
40012 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
40013 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
40014 const X86Subtarget &Subtarget) {
40016 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
40017 SDValue EFLAGS = N->getOperand(1);
40019 // Try to simplify the EFLAGS and condition code operands.
40020 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
40021 return getSETCC(CC, Flags, DL, DAG);
40026 /// Optimize branch condition evaluation.
40027 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
40028 const X86Subtarget &Subtarget) {
40030 SDValue EFLAGS = N->getOperand(3);
40031 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
40033 // Try to simplify the EFLAGS and condition code operands.
40034 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
40035 // RAUW them under us.
40036 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
40037 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
40038 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
40039 N->getOperand(1), Cond, Flags);
40045 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
40046 SelectionDAG &DAG) {
40047 // Take advantage of vector comparisons producing 0 or -1 in each lane to
40048 // optimize away operation when it's from a constant.
40050 // The general transformation is:
40051 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
40052 // AND(VECTOR_CMP(x,y), constant2)
40053 // constant2 = UNARYOP(constant)
40055 // Early exit if this isn't a vector operation, the operand of the
40056 // unary operation isn't a bitwise AND, or if the sizes of the operations
40057 // aren't the same.
40058 EVT VT = N->getValueType(0);
40059 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
40060 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
40061 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
40064 // Now check that the other operand of the AND is a constant. We could
40065 // make the transformation for non-constant splats as well, but it's unclear
40066 // that would be a benefit as it would not eliminate any operations, just
40067 // perform one more step in scalar code before moving to the vector unit.
40068 if (BuildVectorSDNode *BV =
40069 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
40070 // Bail out if the vector isn't a constant.
40071 if (!BV->isConstant())
40074 // Everything checks out. Build up the new and improved node.
40076 EVT IntVT = BV->getValueType(0);
40077 // Create a new constant of the appropriate type for the transformed node.
40079 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
40080 // The AND node needs bitcasts to/from an integer vector type around it.
40081 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
40082 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
40083 N->getOperand(0)->getOperand(0), MaskConst);
40084 SDValue Res = DAG.getBitcast(VT, NewAnd);
40091 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
40092 const X86Subtarget &Subtarget) {
40093 SDValue Op0 = N->getOperand(0);
40094 EVT VT = N->getValueType(0);
40095 EVT InVT = Op0.getValueType();
40097 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
40098 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
40099 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
40100 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
40102 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
40103 InVT.getVectorNumElements());
40104 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
40106 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
40107 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
40110 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
40111 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
40112 // the optimization here.
40113 if (DAG.SignBitIsZero(Op0))
40114 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
40119 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
40120 const X86Subtarget &Subtarget) {
40121 // First try to optimize away the conversion entirely when it's
40122 // conditionally from a constant. Vectors only.
40123 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
40126 // Now move on to more general possibilities.
40127 SDValue Op0 = N->getOperand(0);
40128 EVT VT = N->getValueType(0);
40129 EVT InVT = Op0.getValueType();
40131 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
40132 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
40133 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
40134 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
40136 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
40137 InVT.getVectorNumElements());
40138 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
40139 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
40142 // Without AVX512DQ we only support i64 to float scalar conversion. For both
40143 // vectors and scalars, see if we know that the upper bits are all the sign
40144 // bit, in which case we can truncate the input to i32 and convert from that.
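  // A minimal illustration: for an i64 input with at least 33 sign bits,
  //   (sint_to_fp i64 X) --> (sint_to_fp (trunc X to i32))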
40145 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
40146 unsigned BitWidth = InVT.getScalarSizeInBits();
40147 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
40148 if (NumSignBits >= (BitWidth - 31)) {
40149 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
40150 if (InVT.isVector())
40151 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
40152 InVT.getVectorNumElements());
40154 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
40155 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
40159 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
40160 // a 32-bit target where SSE doesn't support i64->FP operations.
40161 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
40162 Op0.getOpcode() == ISD::LOAD) {
40163 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
40164 EVT LdVT = Ld->getValueType(0);
40166 // This transformation is not supported if the result type is f16 or f128.
40167 if (VT == MVT::f16 || VT == MVT::f128)
40170 // If we have AVX512DQ we can use packed conversion instructions unless the result type is f80.
40172 if (Subtarget.hasDQI() && VT != MVT::f80)
40175 if (!Ld->isVolatile() && !VT.isVector() &&
40176 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
40177 !Subtarget.is64Bit() && LdVT == MVT::i64) {
40178 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
40179 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
40180 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
40187 static bool needCarryOrOverflowFlag(SDValue Flags) {
40188 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
40190 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
40192 SDNode *User = *UI;
40195 switch (User->getOpcode()) {
40197 // Be conservative.
40199 case X86ISD::SETCC:
40200 case X86ISD::SETCC_CARRY:
40201 CC = (X86::CondCode)User->getConstantOperandVal(0);
40203 case X86ISD::BRCOND:
40204 CC = (X86::CondCode)User->getConstantOperandVal(2);
40207 CC = (X86::CondCode)User->getConstantOperandVal(2);
40213 case X86::COND_A: case X86::COND_AE:
40214 case X86::COND_B: case X86::COND_BE:
40215 case X86::COND_O: case X86::COND_NO:
40216 case X86::COND_G: case X86::COND_GE:
40217 case X86::COND_L: case X86::COND_LE:
40225 static bool onlyZeroFlagUsed(SDValue Flags) {
40226 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
40228 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
40230 SDNode *User = *UI;
40233 switch (User->getOpcode()) {
40235 // Be conservative.
40237 case X86ISD::SETCC: CCOpNo = 0; break;
40238 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
40239 case X86ISD::BRCOND: CCOpNo = 2; break;
40240 case X86ISD::CMOV: CCOpNo = 2; break;
40243 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
40244 if (CC != X86::COND_E && CC != X86::COND_NE)
40251 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
40252 // Only handle test patterns.
40253 if (!isNullConstant(N->getOperand(1)))
40256 // If we have a CMP of a truncated binop, see if we can make a smaller binop
40257 // and use its flags directly.
40258 // TODO: Maybe we should try promoting compares that only use the zero flag
40259 // first if we can prove the upper bits with computeKnownBits?
40261 SDValue Op = N->getOperand(0);
40262 EVT VT = Op.getValueType();
40264 // If we have a constant logical shift that's only used in a comparison
40265 // against zero turn it into an equivalent AND. This allows turning it into
40266 // a TEST instruction later.
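  // For example (illustrative, i32 operand with only the zero flag used):
  //   (cmp (srl X, 8), 0) --> (cmp (and X, 0xFFFFFF00), 0)
  // which isel can then match as a TEST against the constant mask.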
40267 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
40268 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
40269 onlyZeroFlagUsed(SDValue(N, 0))) {
40270 EVT VT = Op.getValueType();
40271 unsigned BitWidth = VT.getSizeInBits();
40272 unsigned ShAmt = Op.getConstantOperandVal(1);
40273 if (ShAmt < BitWidth) { // Avoid undefined shifts.
40274 APInt Mask = Op.getOpcode() == ISD::SRL
40275 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
40276 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
40277 if (Mask.isSignedIntN(32)) {
40278 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
40279 DAG.getConstant(Mask, dl, VT));
40280 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
40281 DAG.getConstant(0, dl, VT));
40287 // Look for a truncate with a single use.
40288 if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
40291 Op = Op.getOperand(0);
40293 // Arithmetic op can only have one use.
40294 if (!Op.hasOneUse())
40298 switch (Op.getOpcode()) {
40299 default: return SDValue();
40301 // Skip and with constant. We have special handling for and with immediate
40302 // during isel to generate test instructions.
40303 if (isa<ConstantSDNode>(Op.getOperand(1)))
40305 NewOpc = X86ISD::AND;
40307 case ISD::OR: NewOpc = X86ISD::OR; break;
40308 case ISD::XOR: NewOpc = X86ISD::XOR; break;
40310 // If the carry or overflow flag is used, we can't truncate.
40311 if (needCarryOrOverflowFlag(SDValue(N, 0)))
40313 NewOpc = X86ISD::ADD;
40316 // If the carry or overflow flag is used, we can't truncate.
40317 if (needCarryOrOverflowFlag(SDValue(N, 0)))
40319 NewOpc = X86ISD::SUB;
40323 // We found an op we can narrow. Truncate its inputs.
40324 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
40325 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
40327 // Use a X86 specific opcode to avoid DAG combine messing with it.
40328 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
40329 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
40331 // For AND, keep a CMP so that we can match the test pattern.
40332 if (NewOpc == X86ISD::AND)
40333 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
40334 DAG.getConstant(0, dl, VT));
40336 // Return the flags.
40337 return Op.getValue(1);
40340 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
40341 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
40342 MVT VT = N->getSimpleValueType(0);
40343 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
40344 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
40345 N->getOperand(0), N->getOperand(1),
40352 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
40353 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
40354 TargetLowering::DAGCombinerInfo &DCI) {
40355 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
40356 // the result is either zero or one (depending on the input carry bit).
40357 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
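  // In other words (illustrative):
  //   (adc 0, 0, EFLAGS) --> (and (setcc_carry COND_B, EFLAGS), 1)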
40358 if (X86::isZeroNode(N->getOperand(0)) &&
40359 X86::isZeroNode(N->getOperand(1)) &&
40360 // We don't have a good way to replace an EFLAGS use, so only do this when the EFLAGS result is unused.
40362 SDValue(N, 1).use_empty()) {
40364 EVT VT = N->getValueType(0);
40365 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
40366 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
40367 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
40368 DAG.getConstant(X86::COND_B, DL,
40371 DAG.getConstant(1, DL, VT));
40372 return DCI.CombineTo(N, Res1, CarryOut);
40375 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
40376 MVT VT = N->getSimpleValueType(0);
40377 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
40378 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
40379 N->getOperand(0), N->getOperand(1),
40386 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
40387 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
40388 /// with CMP+{ADC, SBB}.
40389 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
40390 bool IsSub = N->getOpcode() == ISD::SUB;
40391 SDValue X = N->getOperand(0);
40392 SDValue Y = N->getOperand(1);
40394 // If this is an add, canonicalize a zext operand to the RHS.
40395 // TODO: Incomplete? What if both sides are zexts?
40396 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
40397 Y.getOpcode() != ISD::ZERO_EXTEND)
40400 // Look through a one-use zext.
40401 bool PeekedThroughZext = false;
40402 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
40403 Y = Y.getOperand(0);
40404 PeekedThroughZext = true;
40407 // If this is an add, canonicalize a setcc operand to the RHS.
40408 // TODO: Incomplete? What if both sides are setcc?
40409 // TODO: Should we allow peeking through a zext of the other operand?
40410 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
40411 Y.getOpcode() != X86ISD::SETCC)
40414 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
40418 EVT VT = N->getValueType(0);
40419 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
40421 // If X is -1 or 0, then we have an opportunity to avoid constants required in
40422 // the general case below.
40423 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
40425 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
40426 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
40427 // This is a complicated way to get -1 or 0 from the carry flag:
40428 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
40429 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
40430 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
40431 DAG.getConstant(X86::COND_B, DL, MVT::i8),
40435 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
40436 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
40437 SDValue EFLAGS = Y->getOperand(1);
40438 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
40439 EFLAGS.getValueType().isInteger() &&
40440 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
40441 // Swap the operands of a SUB, and we have the same pattern as above.
40442 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
40443 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
40444 SDValue NewSub = DAG.getNode(
40445 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
40446 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
40447 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
40448 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
40449 DAG.getConstant(X86::COND_B, DL, MVT::i8),
40455 if (CC == X86::COND_B) {
40456 // X + SETB Z --> adc X, 0
40457 // X - SETB Z --> sbb X, 0
40458 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
40459 DAG.getVTList(VT, MVT::i32), X,
40460 DAG.getConstant(0, DL, VT), Y.getOperand(1));
40463 if (CC == X86::COND_A) {
40464 SDValue EFLAGS = Y->getOperand(1);
40465 // Try to convert COND_A into COND_B in an attempt to facilitate
40466 // materializing "setb reg".
40468 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
40469 // cannot take an immediate as its first operand.
40471 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
40472 EFLAGS.getValueType().isInteger() &&
40473 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
40474 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
40475 EFLAGS.getNode()->getVTList(),
40476 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
40477 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
40478 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
40479 DAG.getVTList(VT, MVT::i32), X,
40480 DAG.getConstant(0, DL, VT), NewEFLAGS);
40484 if (CC != X86::COND_E && CC != X86::COND_NE)
40487 SDValue Cmp = Y.getOperand(1);
40488 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
40489 !X86::isZeroNode(Cmp.getOperand(1)) ||
40490 !Cmp.getOperand(0).getValueType().isInteger())
40493 SDValue Z = Cmp.getOperand(0);
40494 EVT ZVT = Z.getValueType();
40496 // If X is -1 or 0, then we have an opportunity to avoid constants required in
40497 // the general case below.
40499 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with fake operands:
40501 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
40502 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
40503 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
40504 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
40505 SDValue Zero = DAG.getConstant(0, DL, ZVT);
40506 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
40507 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
40508 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
40509 DAG.getConstant(X86::COND_B, DL, MVT::i8),
40510 SDValue(Neg.getNode(), 1));
40513 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
40514 // with fake operands:
40515 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
40516 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
40517 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
40518 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
40519 SDValue One = DAG.getConstant(1, DL, ZVT);
40520 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
40521 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
40522 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
40526 // (cmp Z, 1) sets the carry flag if Z is 0.
40527 SDValue One = DAG.getConstant(1, DL, ZVT);
40528 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
40530 // Add the flags type for ADC/SBB nodes.
40531 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
40533 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
40534 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
40535 if (CC == X86::COND_NE)
40536 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
40537 DAG.getConstant(-1ULL, DL, VT), Cmp1);
40539 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
40540 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
40541 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
40542 DAG.getConstant(0, DL, VT), Cmp1);
40545 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
40546 const X86Subtarget &Subtarget) {
40547 if (!Subtarget.hasSSE2())
40550 SDValue Op0 = N->getOperand(0);
40551 SDValue Op1 = N->getOperand(1);
40553 EVT VT = N->getValueType(0);
40555 // If the vector size is less than 128, or greater than the supported RegSize,
40556 // do not use PMADD.
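  // The overall rewrite performed below is roughly (illustrative, v8i32 case):
  //   reduction-add (mul (sext v8i16 A to v8i32), (sext v8i16 B to v8i32)), Acc
  //     --> add (concat (v4i32 vpmaddwd A, B), zero), Acc
  // The horizontal pairing done by vpmaddwd is fine here because only the
  // final reduced sum is used.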
40557 if (!VT.isVector() || VT.getVectorNumElements() < 8)
40560 if (Op0.getOpcode() != ISD::MUL)
40561 std::swap(Op0, Op1);
40562 if (Op0.getOpcode() != ISD::MUL)
40566 if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16)
40570 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
40571 VT.getVectorNumElements());
40572 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
40573 VT.getVectorNumElements() / 2);
40575 // Madd vector size is half of the original vector size
40576 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40577 ArrayRef<SDValue> Ops) {
40578 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
40579 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
40582 auto BuildPMADDWD = [&](SDValue Mul) {
40583 // Shrink the operands of mul.
40584 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0));
40585 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1));
40587 SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
40589 // Fill the rest of the output with 0
40590 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd,
40591 DAG.getConstant(0, DL, MAddVT));
40594 Op0 = BuildPMADDWD(Op0);
40596 // It's possible that Op1 is also a mul we can reduce.
40597 if (Op1.getOpcode() == ISD::MUL &&
40598 canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) {
40599 Op1 = BuildPMADDWD(Op1);
40602 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
40605 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
40606 const X86Subtarget &Subtarget) {
40607 if (!Subtarget.hasSSE2())
40611 EVT VT = N->getValueType(0);
40612 SDValue Op0 = N->getOperand(0);
40613 SDValue Op1 = N->getOperand(1);
40615 // TODO: There's nothing special about i32, any integer type above i16 should
40616 // work just as well.
40617 if (!VT.isVector() || !VT.isSimple() ||
40618 !(VT.getVectorElementType() == MVT::i32))
40621 unsigned RegSize = 128;
40622 if (Subtarget.useBWIRegs())
40624 else if (Subtarget.hasAVX())
40627 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
40628 // TODO: We should be able to handle larger vectors by splitting them before
40629 // feeding them into several SADs, and then reducing over those.
40630 if (VT.getSizeInBits() / 4 > RegSize)
40633 // We know N is a reduction add, which means one of its operands is a phi.
40634 // To match SAD, we need the other operand to be a vector select.
40635 if (Op0.getOpcode() != ISD::VSELECT)
40636 std::swap(Op0, Op1);
40637 if (Op0.getOpcode() != ISD::VSELECT)
40640 auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
40641 // SAD pattern detected. Now build a SAD instruction and an addition for
40642 // reduction. Note that the number of elements of the result of SAD is less
40643 // than the number of elements of its input. Therefore, we can only update
40644 // part of the elements in the reduction vector.
40645 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
40647 // The output of PSADBW is a vector of i64.
40648 // We need to turn the vector of i64 into a vector of i32.
40649 // If the reduction vector is at least as wide as the psadbw result, just
40650 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero anyway.
40652 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
40653 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
40654 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
40656 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
40658 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
40659 // Fill the upper elements with zero to match the add width.
40660 SDValue Zero = DAG.getConstant(0, DL, VT);
40661 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
40662 DAG.getIntPtrConstant(0, DL));
40668 // Check whether we have an abs-diff pattern feeding into the select.
40669 SDValue SadOp0, SadOp1;
40670 if (!detectZextAbsDiff(Op0, SadOp0, SadOp1))
40673 Op0 = BuildPSADBW(SadOp0, SadOp1);
40675 // It's possible we have a sad on the other side too.
40676 if (Op1.getOpcode() == ISD::VSELECT &&
40677 detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
40678 Op1 = BuildPSADBW(SadOp0, SadOp1);
40681 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
40684 /// Convert vector increment or decrement to sub/add with an all-ones constant:
40685 /// add X, <1, 1...> --> sub X, <-1, -1...>
40686 /// sub X, <1, 1...> --> add X, <-1, -1...>
40687 /// The all-ones vector constant can be materialized using a pcmpeq instruction
40688 /// that is commonly recognized as an idiom (has no register dependency), so
40689 /// that's better/smaller than loading a splat 1 constant.
40690 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
40691 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
40692 "Unexpected opcode for increment/decrement transform");
40694 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
40695 // out and wait for legalization if we have an unsupported vector length.
40696 EVT VT = N->getValueType(0);
40697 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
40701 if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue())
40704 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
40705 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
40706 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
40709 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
40710 const SDLoc &DL, EVT VT,
40711 const X86Subtarget &Subtarget) {
40712 // Example of pattern we try to detect:
40713 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
40714 //(add (build_vector (extract_elt t, 0),
40715 // (extract_elt t, 2),
40716 // (extract_elt t, 4),
40717 // (extract_elt t, 6)),
40718 // (build_vector (extract_elt t, 1),
40719 // (extract_elt t, 3),
40720 // (extract_elt t, 5),
40721 // (extract_elt t, 7)))
40723 if (!Subtarget.hasSSE2())
40726 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
40727 Op1.getOpcode() != ISD::BUILD_VECTOR)
40730 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
40731 VT.getVectorNumElements() < 4 ||
40732 !isPowerOf2_32(VT.getVectorNumElements()))
40735 // Check if one of Op0,Op1 is of the form:
40736 // (build_vector (extract_elt Mul, 0),
40737 // (extract_elt Mul, 2),
40738 // (extract_elt Mul, 4),
40740 // the other is of the form:
40741 // (build_vector (extract_elt Mul, 1),
40742 // (extract_elt Mul, 3),
40743 // (extract_elt Mul, 5),
40745 // and identify Mul.
40747 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
40748 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
40749 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
40750 // TODO: Be more tolerant to undefs.
40751 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
40752 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
40753 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
40754 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
40756 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
40757 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
40758 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
40759 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
40760 if (!Const0L || !Const1L || !Const0H || !Const1H)
40762 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
40763 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
40764 // Commutativity of mul allows factors of a product to reorder.
40766 std::swap(Idx0L, Idx1L);
40768 std::swap(Idx0H, Idx1H);
40769 // Commutativity of add allows pairs of factors to reorder.
40770 if (Idx0L > Idx0H) {
40771 std::swap(Idx0L, Idx0H);
40772 std::swap(Idx1L, Idx1H);
40774 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
40775 Idx1H != 2 * i + 3)
40778 // First time an extract_elt's source vector is visited. Must be a MUL
40779 // with 2X the number of vector elements of the BUILD_VECTOR.
40780 // Both extracts must be from the same MUL.
40781 Mul = Op0L->getOperand(0);
40782 if (Mul->getOpcode() != ISD::MUL ||
40783 Mul.getValueType().getVectorNumElements() != 2 * e)
40786 // Check that the extract is from the same MUL previously seen.
40787 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
40788 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
40792 // Check if the Mul source can be safely shrunk.
40794 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
40797 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40798 ArrayRef<SDValue> Ops) {
40799 // Shrink by adding truncate nodes and let DAGCombine fold them with the preceding nodes.
40801 EVT InVT = Ops[0].getValueType();
40802 assert(InVT.getScalarType() == MVT::i32 &&
40803 "Unexpected scalar element type");
40804 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
40805 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
40806 InVT.getVectorNumElements() / 2);
40807 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
40808 InVT.getVectorNumElements());
40809 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
40810 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
40811 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
40813 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
40814 { Mul.getOperand(0), Mul.getOperand(1) },
40818 // Try to turn (add (umax X, C), -C) into (psubus X, C)
40819 static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG,
40820 const X86Subtarget &Subtarget) {
40821 if (!Subtarget.hasSSE2())
40824 EVT VT = N->getValueType(0);
40826 // psubus is available in SSE2 for i8 and i16 vectors.
40827 if (!VT.isVector() || VT.getVectorNumElements() < 2 ||
40828 !isPowerOf2_32(VT.getVectorNumElements()) ||
40829 !(VT.getVectorElementType() == MVT::i8 ||
40830 VT.getVectorElementType() == MVT::i16))
40833 SDValue Op0 = N->getOperand(0);
40834 SDValue Op1 = N->getOperand(1);
40835 if (Op0.getOpcode() != ISD::UMAX)
40838 // The add should have a constant that is the negative of the max.
40839 // TODO: Handle build_vectors with undef elements.
40840 auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
40841 return Max->getAPIntValue() == (-Op->getAPIntValue());
40843 if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT))
40847 return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0),
40848 Op0.getOperand(1));
40851 // Attempt to turn this pattern into PMADDWD.
40852 // (mul (add (zext (build_vector)), (zext (build_vector))),
40853 // (add (zext (build_vector)), (zext (build_vector)))
40854 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
40855 const SDLoc &DL, EVT VT,
40856 const X86Subtarget &Subtarget) {
40857 if (!Subtarget.hasSSE2())
40860 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
40863 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
40864 VT.getVectorNumElements() < 4 ||
40865 !isPowerOf2_32(VT.getVectorNumElements()))
40868 SDValue N00 = N0.getOperand(0);
40869 SDValue N01 = N0.getOperand(1);
40870 SDValue N10 = N1.getOperand(0);
40871 SDValue N11 = N1.getOperand(1);
40873 // All inputs need to be sign extends.
40874 // TODO: Support ZERO_EXTEND from known positive?
40875 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
40876 N01.getOpcode() != ISD::SIGN_EXTEND ||
40877 N10.getOpcode() != ISD::SIGN_EXTEND ||
40878 N11.getOpcode() != ISD::SIGN_EXTEND)
40881 // Peek through the extends.
40882 N00 = N00.getOperand(0);
40883 N01 = N01.getOperand(0);
40884 N10 = N10.getOperand(0);
40885 N11 = N11.getOperand(0);
40887 // Must be extending from vXi16.
40888 EVT InVT = N00.getValueType();
40889 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
40890 N10.getValueType() != InVT || N11.getValueType() != InVT)
40893 // All inputs should be build_vectors.
40894 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
40895 N01.getOpcode() != ISD::BUILD_VECTOR ||
40896 N10.getOpcode() != ISD::BUILD_VECTOR ||
40897 N11.getOpcode() != ISD::BUILD_VECTOR)
40900 // For each element, we need to ensure we have an odd element from one vector
40901 // multiplied by the odd element of another vector and the even element from
40902 // one of the same vectors being multiplied by the even element from the
40903 // other vector. So we need to make sure for each element i, this operator
40904 // is being performed:
40905 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
40907 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
40908 SDValue N00Elt = N00.getOperand(i);
40909 SDValue N01Elt = N01.getOperand(i);
40910 SDValue N10Elt = N10.getOperand(i);
40911 SDValue N11Elt = N11.getOperand(i);
40912 // TODO: Be more tolerant to undefs.
40913 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
40914 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
40915 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
40916 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
40918 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
40919 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
40920 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
40921 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
40922 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
40924 unsigned IdxN00 = ConstN00Elt->getZExtValue();
40925 unsigned IdxN01 = ConstN01Elt->getZExtValue();
40926 unsigned IdxN10 = ConstN10Elt->getZExtValue();
40927 unsigned IdxN11 = ConstN11Elt->getZExtValue();
40928 // Add is commutative so indices can be reordered.
40929 if (IdxN00 > IdxN10) {
40930 std::swap(IdxN00, IdxN10);
40931 std::swap(IdxN01, IdxN11);
40933 // N0 indices must be the even elements. N1 indices must be the next odd elements.
40934 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
40935 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
40937 SDValue N00In = N00Elt.getOperand(0);
40938 SDValue N01In = N01Elt.getOperand(0);
40939 SDValue N10In = N10Elt.getOperand(0);
40940 SDValue N11In = N11Elt.getOperand(0);
40941 // The first time we find an input, capture it.
40946 // Mul is commutative so the input vectors can be in any order.
40947 // Canonicalize to make the compares easier.
40949 std::swap(N00In, N01In);
40951 std::swap(N10In, N11In);
40952 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
40956 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40957 ArrayRef<SDValue> Ops) {
40958 // The inputs here are already vXi16, so just emit the VPMADDWD at the split width.
40960 EVT InVT = Ops[0].getValueType();
40961 assert(InVT.getScalarType() == MVT::i16 &&
40962 "Unexpected scalar element type");
40963 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
40964 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
40965 InVT.getVectorNumElements() / 2);
40966 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
40968 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
40972 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
40973 const X86Subtarget &Subtarget) {
40974 const SDNodeFlags Flags = N->getFlags();
40975 if (Flags.hasVectorReduction()) {
40976 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
40978 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
40981 EVT VT = N->getValueType(0);
40982 SDValue Op0 = N->getOperand(0);
40983 SDValue Op1 = N->getOperand(1);
40985 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
40987 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
40990 // Try to synthesize horizontal adds from adds of shuffles.
40991 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
40992 VT == MVT::v8i32) &&
40993 Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
40994 shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
40995 auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40996 ArrayRef<SDValue> Ops) {
40997 return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
40999 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
41003 if (SDValue V = combineIncDecVector(N, DAG))
41006 if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget))
41009 return combineAddOrSubToADCOrSBB(N, DAG);
41012 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
41013 const X86Subtarget &Subtarget) {
41014 SDValue Op0 = N->getOperand(0);
41015 SDValue Op1 = N->getOperand(1);
41016 EVT VT = N->getValueType(0);
41018 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
41019 // is only worth it with SSSE3 (PSHUFB).
41020 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
41021 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
41022 !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
41023 !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
41024 VT == MVT::v16i32 || VT == MVT::v8i64)))
41027 SDValue SubusLHS, SubusRHS;
41028 // Try to find umax(a,b) - b or a - umin(a,b) patterns
41029 // that may be converted to subus(a,b).
41030 // TODO: Need to add IR canonicalization for this code.
41031 if (Op0.getOpcode() == ISD::UMAX) {
41033 SDValue MaxLHS = Op0.getOperand(0);
41034 SDValue MaxRHS = Op0.getOperand(1);
41037 else if (MaxRHS == Op1)
41041 } else if (Op1.getOpcode() == ISD::UMIN) {
41043 SDValue MinLHS = Op1.getOperand(0);
41044 SDValue MinRHS = Op1.getOperand(1);
41047 else if (MinRHS == Op0)
41054 auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
41055 ArrayRef<SDValue> Ops) {
41056 return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
41059 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
41060 // special preprocessing in some cases.
41061 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
41062 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
41063 { SubusLHS, SubusRHS }, USUBSATBuilder);
41065 // The special preprocessing case can only be applied
41066 // if the value was zero extended from 16 bits,
41067 // so we require the upper 16 bits to be zero for 32-bit
41068 // values, or the upper 48 bits to be zero for 64-bit values.
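  // Sketch of that preprocessing for v8i32 inputs whose upper 16 bits are zero:
  //   SubusRHS' = umin(SubusRHS, 0xFFFF)
  //   result    = zext (usubsat (trunc SubusLHS to v8i16), (trunc SubusRHS' to v8i16)) to v8i32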
41069 KnownBits Known = DAG.computeKnownBits(SubusLHS);
41070 unsigned NumZeros = Known.countMinLeadingZeros();
41071 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
41074 EVT ExtType = SubusLHS.getValueType();
41076 if (VT == MVT::v8i32 || VT == MVT::v8i64)
41077 ShrinkedType = MVT::v8i16;
41079 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
41081 // If SubusLHS is zero extended, truncate SubusRHS to the same
41082 // width first: SubusRHS = umin(0xFFF.., SubusRHS).
41083 SDValue SaturationConst =
41084 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
41085 ShrinkedType.getScalarSizeInBits()),
41086 SDLoc(SubusLHS), ExtType);
41087 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
41089 SDValue NewSubusLHS =
41090 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
41091 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
41093 SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
41094 { NewSubusLHS, NewSubusRHS }, USUBSATBuilder);
41095 // Zero extend the result; it may be used somewhere as a 32-bit value.
41096 // If not, the zext and the following trunc will be folded away.
41097 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
41100 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
41101 const X86Subtarget &Subtarget) {
41102 SDValue Op0 = N->getOperand(0);
41103 SDValue Op1 = N->getOperand(1);
41105 // X86 can't encode an immediate LHS of a sub. See if we can push the
41106 // negation into a preceding instruction.
41107 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
41108 // If the RHS of the sub is a XOR with one use and a constant, invert the
41109 // immediate. Then add one to the LHS of the sub so we can turn
41110 // X-Y -> X+~Y+1, saving one register.
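    // Concretely (illustrative): (sub 5, (xor X, 1)) --> (add (xor X, ~1), 6).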
41111 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
41112 isa<ConstantSDNode>(Op1.getOperand(1))) {
41113 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
41114 EVT VT = Op0.getValueType();
41115 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
41117 DAG.getConstant(~XorC, SDLoc(Op1), VT));
41118 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
41119 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
41123 // Try to synthesize horizontal subs from subs of shuffles.
41124 EVT VT = N->getValueType(0);
41125 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
41126 VT == MVT::v8i32) &&
41127 Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
41128 shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
41129 auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
41130 ArrayRef<SDValue> Ops) {
41131 return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
41133 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
41137 if (SDValue V = combineIncDecVector(N, DAG))
41140 // Try to create PSUBUS if SUB's argument is max/min
41141 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
41144 return combineAddOrSubToADCOrSBB(N, DAG);
41147 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
41148 const X86Subtarget &Subtarget) {
41149 MVT VT = N->getSimpleValueType(0);
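  // A vector integer compare of a value against itself folds to a constant:
  // PCMPEQ X, X is all-ones and PCMPGT X, X is all-zeros.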
41152 if (N->getOperand(0) == N->getOperand(1)) {
41153 if (N->getOpcode() == X86ISD::PCMPEQ)
41154 return DAG.getConstant(-1, DL, VT);
41155 if (N->getOpcode() == X86ISD::PCMPGT)
41156 return DAG.getConstant(0, DL, VT);
41162 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
41163 TargetLowering::DAGCombinerInfo &DCI,
41164 const X86Subtarget &Subtarget) {
41165 if (DCI.isBeforeLegalizeOps())
41168 MVT OpVT = N->getSimpleValueType(0);
41170 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
41173 SDValue Vec = N->getOperand(0);
41174 SDValue SubVec = N->getOperand(1);
41176 unsigned IdxVal = N->getConstantOperandVal(2);
41177 MVT SubVecVT = SubVec.getSimpleValueType();
41179 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
41180 // Inserting zeros into zeros is a nop.
41181 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
41182 return getZeroVector(OpVT, Subtarget, DAG, dl);
41184 // If we're inserting into a zero vector and then into a larger zero vector,
41185 // just insert into the larger zero vector directly.
41186 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
41187 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
41188 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
41189 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
41190 getZeroVector(OpVT, Subtarget, DAG, dl),
41191 SubVec.getOperand(1),
41192 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
41195 // If we're inserting into a zero vector and our input was extracted from an
41196 // insert into a zero vector of the same type and the extraction was at
41197 // least as large as the original insertion, just insert the original
41198 // subvector into a zero vector.
41199 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
41200 SubVec.getConstantOperandVal(1) == 0 &&
41201 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
41202 SDValue Ins = SubVec.getOperand(0);
41203 if (Ins.getConstantOperandVal(2) == 0 &&
41204 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
41205 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
41206 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
41207 getZeroVector(OpVT, Subtarget, DAG, dl),
41208 Ins.getOperand(1), N->getOperand(2));
41211 // If we're inserting a bitcast into zeros, rewrite the insert and move the
41212 // bitcast to the other side. This helps with detecting zero extending patterns during isel.
41214 // TODO: Is this useful for other indices than 0?
41215 if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
41216 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
41217 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
41218 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
41219 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
41220 DAG.getBitcast(NewVT, Vec),
41221 SubVec.getOperand(0), N->getOperand(2));
41222 return DAG.getBitcast(OpVT, Insert);
41226 // Stop here if this is an i1 vector.
41230 // If this is an insert of an extract, combine to a shuffle. Don't do this
41231 // if the insert or extract can be represented with a subregister operation.
41232 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41233 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
41234 (IdxVal != 0 || !Vec.isUndef())) {
41235 int ExtIdxVal = SubVec.getConstantOperandVal(1);
41236 if (ExtIdxVal != 0) {
41237 int VecNumElts = OpVT.getVectorNumElements();
41238 int SubVecNumElts = SubVecVT.getVectorNumElements();
41239 SmallVector<int, 64> Mask(VecNumElts);
41240 // First create an identity shuffle mask.
41241 for (int i = 0; i != VecNumElts; ++i)
41243 // Now insert the extracted portion.
41244 for (int i = 0; i != SubVecNumElts; ++i)
41245 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
41247 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
41251 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
41253 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
41254 // (load16 addr + 16), Elts/2)
41257 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
41258 // (load32 addr + 32), Elts/2)
41260 // or a 16-byte or 32-byte broadcast:
41261 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
41262 // (load16 addr), Elts/2)
41263 // --> X86SubVBroadcast(load16 addr)
41265 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
41266 // (load32 addr), Elts/2)
41267 // --> X86SubVBroadcast(load32 addr)
41268 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
41269 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
41270 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
41271 if (isNullConstant(Vec.getOperand(2))) {
41272 SDValue SubVec2 = Vec.getOperand(1);
41273 // If needed, look through bitcasts to get to the load.
41274 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
41276 unsigned Alignment = FirstLd->getAlignment();
41277 unsigned AS = FirstLd->getAddressSpace();
41278 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
41279 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
41280 OpVT, AS, Alignment, &Fast) && Fast) {
41281 SDValue Ops[] = {SubVec2, SubVec};
41282 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
41287 // If lower/upper loads are the same and there's no other use of the lower
41288 // load, then splat the loaded value with a broadcast.
41289 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
41290 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && Vec.hasOneUse())
41291 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
41293 // If this is a subv_broadcast inserted into both halves, use a larger subv_broadcast.
41295 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
41296 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
41297 SubVec.getOperand(0));
41299 // If we're inserting all zeros into the upper half, change this to
41300 // an insert into an all zeros vector. We will match this to a move
41301 // with implicit upper bit zeroing during isel.
41302 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
41303 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
41304 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
41305 Vec.getOperand(2));
41307 // If we are inserting into both halves of the vector, the starting
41308 // vector should be undef. If it isn't, make it so. Only do this if the
41309 // early insert has no other uses.
41310 // TODO: Should this be a generic DAG combine?
41311 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
41312 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
41313 SubVec2, Vec.getOperand(2));
41314 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
41324 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
41325 TargetLowering::DAGCombinerInfo &DCI,
41326 const X86Subtarget &Subtarget) {
41327 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
41328 // eventually get combined/lowered into ANDNP) with a concatenated operand,
41329 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
41330 // We let generic combining take over from there to simplify the
41331 // insert/extract and 'not'.
41332 // This pattern emerges during AVX1 legalization. We handle it before lowering
41333 // to avoid complications like splitting constant vector loads.
41335 // Capture the original wide type in the likely case that we need to bitcast
41336 // back to this type.
41337 EVT VT = N->getValueType(0);
41338 EVT WideVecVT = N->getOperand(0).getValueType();
41339 SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
41340 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41341 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
41342 TLI.isTypeLegal(WideVecVT) &&
41343 WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
41344 auto isConcatenatedNot = [] (SDValue V) {
41345 V = peekThroughBitcasts(V);
41346 if (!isBitwiseNot(V))
41348 SDValue NotOp = V->getOperand(0);
41349 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
41351 if (isConcatenatedNot(WideVec.getOperand(0)) ||
41352 isConcatenatedNot(WideVec.getOperand(1))) {
41353 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
41354 SDValue Concat = split256IntArith(WideVec, DAG);
41355 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
41356 DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
41360 if (DCI.isBeforeLegalizeOps())
41363 MVT OpVT = N->getSimpleValueType(0);
41364 SDValue InVec = N->getOperand(0);
41365 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
41367 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
41368 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
41370 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
41371 if (OpVT.getScalarType() == MVT::i1)
41372 return DAG.getConstant(1, SDLoc(N), OpVT);
41373 return getOnesVector(OpVT, DAG, SDLoc(N));
41376 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
41377 return DAG.getBuildVector(
41379 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
41381 // If we're extracting the lowest subvector and the source has only one use,
41382 // we may be able to perform this with a smaller vector width.
41383 if (IdxVal == 0 && InVec.hasOneUse()) {
41384 unsigned InOpcode = InVec.getOpcode();
41385 if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
41386 // v2f64 CVTDQ2PD(v4i32).
41387 if (InOpcode == ISD::SINT_TO_FP &&
41388 InVec.getOperand(0).getValueType() == MVT::v4i32) {
41389 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
41391 // v2f64 CVTPS2PD(v4f32).
41392 if (InOpcode == ISD::FP_EXTEND &&
41393 InVec.getOperand(0).getValueType() == MVT::v4f32) {
41394 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
41397 if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) &&
41398 OpVT.is128BitVector() &&
41399 InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
41401 InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG
41402 : ISD::SIGN_EXTEND_VECTOR_INREG;
41403 return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
41405 if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
41406 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
41407 OpVT.is128BitVector() &&
41408 InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
41409 return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0));
41411 if (InOpcode == ISD::BITCAST) {
41412 // TODO - do this for target shuffles in general.
41413 SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
41414 if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) {
41416 SDValue SubPSHUFB =
41417 DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
41418 extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL),
41419 extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL));
41420 return DAG.getBitcast(OpVT, SubPSHUFB);
41428 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
41429 EVT VT = N->getValueType(0);
41430 SDValue Src = N->getOperand(0);
41432 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
41433 // This occurs frequently in our masked scalar intrinsic code and our
41434 // floating point select lowering with AVX512.
41435 // TODO: SimplifyDemandedBits instead?
41436 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
41437 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
41438 if (C->getAPIntValue().isOneValue())
41439 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
41440 Src.getOperand(0));
41442 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
41443 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41444 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
41445 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
41446 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
41447 if (C->isNullValue())
41448 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
41449 Src.getOperand(0), Src.getOperand(1));
41451 return SDValue();
41452 }
41454 // Simplify PMULDQ and PMULUDQ operations.
41455 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
41456 TargetLowering::DAGCombinerInfo &DCI) {
41457 SDValue LHS = N->getOperand(0);
41458 SDValue RHS = N->getOperand(1);
41460 // Canonicalize constant to RHS.
41461 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
41462 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
41463 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
41465 // Multiply by zero.
41466 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
41467 return RHS;
41469 // Aggressively peek through ops to get at the demanded low bits.
41470 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
41471 SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask);
41472 SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask);
41473 if (DemandedLHS || DemandedRHS)
41474 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
41475 DemandedLHS ? DemandedLHS : LHS,
41476 DemandedRHS ? DemandedRHS : RHS);
41478 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
41479 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41480 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
41481 return SDValue(N, 0);
41483 return SDValue();
41484 }
41486 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
41487 DAGCombinerInfo &DCI) const {
41488 SelectionDAG &DAG = DCI.DAG;
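// Dispatch on the node's opcode to the matching target-specific combine; opcodes without a combine fall through and return an empty SDValue below.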
41489 switch (N->getOpcode()) {
41490 default: break;
41491 case ISD::SCALAR_TO_VECTOR:
41492 return combineScalarToVector(N, DAG);
41493 case ISD::EXTRACT_VECTOR_ELT:
41494 case X86ISD::PEXTRW:
41495 case X86ISD::PEXTRB:
41496 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
41497 case ISD::INSERT_SUBVECTOR:
41498 return combineInsertSubvector(N, DAG, DCI, Subtarget);
41499 case ISD::EXTRACT_SUBVECTOR:
41500 return combineExtractSubvector(N, DAG, DCI, Subtarget);
41501 case ISD::VSELECT:
41502 case ISD::SELECT:
41503 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
41504 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
41505 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
41506 case X86ISD::CMP: return combineCMP(N, DAG);
41507 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
41508 case ISD::SUB: return combineSub(N, DAG, Subtarget);
41509 case X86ISD::SBB: return combineSBB(N, DAG);
41510 case X86ISD::ADC: return combineADC(N, DAG, DCI);
41511 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
41512 case ISD::SHL:
41513 case ISD::SRA:
41514 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
41515 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
41516 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
41517 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
41518 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
41519 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
41520 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
41521 case ISD::STORE: return combineStore(N, DAG, Subtarget);
41522 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
41523 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
41524 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
41525 case ISD::FADD:
41526 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
41527 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
41528 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
41529 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
41530 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
41531 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
41532 case X86ISD::FXOR:
41533 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
41534 case X86ISD::FMIN:
41535 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
41536 case ISD::FMINNUM:
41537 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
41538 case X86ISD::CVTSI2P:
41539 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
41540 case X86ISD::BT: return combineBT(N, DAG, DCI);
41541 case ISD::ANY_EXTEND:
41542 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
41543 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
41544 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
41545 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
41546 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
41547 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
41548 case X86ISD::PACKSS:
41549 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
41550 case X86ISD::VSHL:
41551 case X86ISD::VSRA:
41552 case X86ISD::VSRL:
41553 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
41554 case X86ISD::VSHLI:
41555 case X86ISD::VSRAI:
41556 case X86ISD::VSRLI:
41557 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
41558 case X86ISD::PINSRB:
41559 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
41560 case X86ISD::SHUFP: // Handle all target specific shuffles
41561 case X86ISD::INSERTPS:
41562 case X86ISD::EXTRQI:
41563 case X86ISD::INSERTQI:
41564 case X86ISD::PALIGNR:
41565 case X86ISD::VSHLDQ:
41566 case X86ISD::VSRLDQ:
41567 case X86ISD::BLENDI:
41568 case X86ISD::UNPCKH:
41569 case X86ISD::UNPCKL:
41570 case X86ISD::MOVHLPS:
41571 case X86ISD::MOVLHPS:
41572 case X86ISD::PSHUFB:
41573 case X86ISD::PSHUFD:
41574 case X86ISD::PSHUFHW:
41575 case X86ISD::PSHUFLW:
41576 case X86ISD::MOVSHDUP:
41577 case X86ISD::MOVSLDUP:
41578 case X86ISD::MOVDDUP:
41579 case X86ISD::MOVSS:
41580 case X86ISD::MOVSD:
41581 case X86ISD::VBROADCAST:
41582 case X86ISD::VPPERM:
41583 case X86ISD::VPERMI:
41584 case X86ISD::VPERMV:
41585 case X86ISD::VPERMV3:
41586 case X86ISD::VPERMIL2:
41587 case X86ISD::VPERMILPI:
41588 case X86ISD::VPERMILPV:
41589 case X86ISD::VPERM2X128:
41590 case X86ISD::SHUF128:
41591 case X86ISD::VZEXT_MOVL:
41592 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
41593 case X86ISD::FMADD_RND:
41594 case X86ISD::FMSUB:
41595 case X86ISD::FMSUB_RND:
41596 case X86ISD::FNMADD:
41597 case X86ISD::FNMADD_RND:
41598 case X86ISD::FNMSUB:
41599 case X86ISD::FNMSUB_RND:
41600 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
41601 case X86ISD::FMADDSUB_RND:
41602 case X86ISD::FMSUBADD_RND:
41603 case X86ISD::FMADDSUB:
41604 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
41605 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
41606 case X86ISD::MGATHER:
41607 case X86ISD::MSCATTER:
41608 case ISD::MGATHER:
41609 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
41610 case X86ISD::PCMPEQ:
41611 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
41612 case X86ISD::PMULDQ:
41613 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
41614 }
41616 return SDValue();
41617 }
41619 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
41620 if (!isTypeLegal(VT))
41621 return false;
41623 // There are no vXi8 shifts.
41624 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
41625 return false;
41627 // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
41628 // we have specializations to turn 32-bit multiply into LEA or other ops.
41629 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
41630 // check for a constant operand to the multiply.
41631 if (Opc == ISD::MUL && VT == MVT::i8)
41632 return false;
41634 // i16 instruction encodings are longer and some i16 instructions are slow,
41635 // so those are not desirable.
41636 if (VT == MVT::i16) {
41637 switch (Opc) {
41638 default:
41639 break;
41640 case ISD::LOAD:
41641 case ISD::SIGN_EXTEND:
41642 case ISD::ZERO_EXTEND:
41643 case ISD::ANY_EXTEND:
41644 case ISD::SHL:
41645 case ISD::SRL:
41646 case ISD::SUB:
41647 case ISD::ADD:
41648 case ISD::MUL:
41649 case ISD::AND:
41650 case ISD::OR:
41651 case ISD::XOR:
41652 return false;
41653 }
41654 }
41656 // Any legal type not explicitly accounted for above here is desirable.
41657 return true;
41658 }
41660 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
41661 SDValue Value, SDValue Addr,
41662 SelectionDAG &DAG) const {
41663 const Module *M = DAG.getMachineFunction().getMMI().getModule();
41664 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
41665 if (IsCFProtectionSupported) {
41666 // If control-flow branch protection is enabled, we need to add a notrack
41667 // prefix to the indirect branch. To do that we create the NT_BRIND SDNode;
41668 // during instruction selection the pattern lowers it to a jmp with the
41669 // NoTrack prefix.
41670 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
41671 }
41673 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
41674 }
41676 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
41677 EVT VT = Op.getValueType();
41678 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
41679 isa<ConstantSDNode>(Op.getOperand(1));
41681 // i16 is legal, but undesirable since i16 instruction encodings are longer
41682 // and some i16 instructions are slow.
41683 // 8-bit multiply-by-constant can usually be expanded to something cheaper
41684 // using LEA and/or other ALU ops.
41685 if (VT != MVT::i16 && !Is8BitMulByConstant)
41686 return false;
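// Returns true if Op's only user is a normal store to the same base address the load reads from, i.e. the whole load-op-store sequence can fold into a single memory read-modify-write instruction.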
41688 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
41689 if (!Op.hasOneUse())
41690 return false;
41691 SDNode *User = *Op->use_begin();
41692 if (!ISD::isNormalStore(User))
41693 return false;
41694 auto *Ld = cast<LoadSDNode>(Load);
41695 auto *St = cast<StoreSDNode>(User);
41696 return Ld->getBasePtr() == St->getBasePtr();
41697 };
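// Same idea for atomics: the load must be an ATOMIC_LOAD with a single use and Op's only user an ATOMIC_STORE to the same base address.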
41699 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
41700 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
41701 return false;
41702 if (!Op.hasOneUse())
41703 return false;
41704 SDNode *User = *Op->use_begin();
41705 if (User->getOpcode() != ISD::ATOMIC_STORE)
41706 return false;
41707 auto *Ld = cast<AtomicSDNode>(Load);
41708 auto *St = cast<AtomicSDNode>(User);
41709 return Ld->getBasePtr() == St->getBasePtr();
41710 };
41712 bool Commute = false;
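// For the commutative ops below either operand may supply the foldable load, so remember that swapping the operands is allowed.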
41713 switch (Op.getOpcode()) {
41714 default: return false;
41715 case ISD::SIGN_EXTEND:
41716 case ISD::ZERO_EXTEND:
41717 case ISD::ANY_EXTEND:
41718 break;
41719 case ISD::SHL:
41720 case ISD::SRL: {
41721 SDValue N0 = Op.getOperand(0);
41722 // Look out for (store (shl (load), x)).
41723 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
41724 return false;
41725 break;
41726 }
41727 case ISD::ADD:
41728 case ISD::MUL:
41729 case ISD::AND:
41730 case ISD::OR:
41731 case ISD::XOR:
41732 Commute = true;
41733 LLVM_FALLTHROUGH;
41734 case ISD::SUB: {
41735 SDValue N0 = Op.getOperand(0);
41736 SDValue N1 = Op.getOperand(1);
41737 // Avoid disabling potential load folding opportunities.
41738 if (MayFoldLoad(N1) &&
41739 (!Commute || !isa<ConstantSDNode>(N0) ||
41740 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
41741 return false;
41742 if (MayFoldLoad(N0) &&
41743 ((Commute && !isa<ConstantSDNode>(N1)) ||
41744 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
41745 return false;
41746 if (IsFoldableAtomicRMW(N0, Op) ||
41747 (Commute && IsFoldableAtomicRMW(N1, Op)))
41748 return false;
41749 }
41750 }
41752 PVT = MVT::i32;
41753 return true;
41754 }
41756 bool X86TargetLowering::
41757 isDesirableToCombineBuildVectorToShuffleTruncate(
41758 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
41760 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
41761 "Element count mismatch");
41763 Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
41764 "Shuffle Mask expected to be legal");
41766 // For 32-bit elements VPERMD is better than shuffle+truncate.
41767 // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
41768 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
41769 return false;
41771 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
41772 return false;
41774 return true;
41775 }
41777 //===----------------------------------------------------------------------===//
41778 // X86 Inline Assembly Support
41779 //===----------------------------------------------------------------------===//
41781 // Helper to match a string separated by whitespace.
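// e.g. matchAsm("bswap $0", {"bswap", "$0"}) returns true; any unmatched trailing text makes it return false.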
41782 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
41783 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
41785 for (StringRef Piece : Pieces) {
41786 if (!S.startswith(Piece)) // Check if the piece matches.
41787 return false;
41789 S = S.substr(Piece.size());
41790 StringRef::size_type Pos = S.find_first_not_of(" \t");
41791 if (Pos == 0) // We matched a prefix.
41792 return false;
41794 S = S.substr(Pos);
41795 }
41797 return S.empty();
41798 }
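// Returns true if the inline asm's clobber list names the condition-code and FP status registers ("~{cc}", "~{flags}", "~{fpsr}"), plus "~{dirflag}" when four clobbers are listed.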
41800 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
41802 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
41803 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
41804 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
41805 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
41807 if (AsmPieces.size() == 3)
41808 return true;
41809 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
41810 return true;
41811 }
41812 }
41814 return false;
41815 }
41816 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
41817 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
41819 const std::string &AsmStr = IA->getAsmString();
41821 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
41822 if (!Ty || Ty->getBitWidth() % 16 != 0)
41823 return false;
41825 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
41826 SmallVector<StringRef, 4> AsmPieces;
41827 SplitString(AsmStr, AsmPieces, ";\n");
41829 switch (AsmPieces.size()) {
41830 default: return false;
41831 case 1:
41832 // FIXME: this should verify that we are targeting a 486 or better. If not,
41833 // we will turn this bswap into something that will be lowered to logical
41834 // ops instead of emitting the bswap asm. For now, we don't support 486 or
41835 // lower so don't worry about this.
41837 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
41838 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
41839 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
41840 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
41841 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
41842 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
41843 // No need to check constraints, nothing other than the equivalent of
41844 // "=r,0" would be valid here.
41845 return IntrinsicLowering::LowerToByteSwap(CI);
41846 }
41848 // rorw $$8, ${0:w} --> llvm.bswap.i16
41849 if (CI->getType()->isIntegerTy(16) &&
41850 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
41851 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
41852 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
41854 StringRef ConstraintsStr = IA->getConstraintString();
41855 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
41856 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
41857 if (clobbersFlagRegisters(AsmPieces))
41858 return IntrinsicLowering::LowerToByteSwap(CI);
41859 }
41860 break;
41861 case 3:
41862 if (CI->getType()->isIntegerTy(32) &&
41863 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
41864 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
41865 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
41866 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
41868 StringRef ConstraintsStr = IA->getConstraintString();
41869 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
41870 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
41871 if (clobbersFlagRegisters(AsmPieces))
41872 return IntrinsicLowering::LowerToByteSwap(CI);
41873 }
41875 if (CI->getType()->isIntegerTy(64)) {
41876 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
41877 if (Constraints.size() >= 2 &&
41878 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
41879 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
41880 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
41881 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
41882 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
41883 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
41884 return IntrinsicLowering::LowerToByteSwap(CI);
41885 }
41886 }
41887 break;
41888 }
41890 return false;
41891 }
41892 /// Given a constraint letter, return the type of constraint for this target.
41893 X86TargetLowering::ConstraintType
41894 X86TargetLowering::getConstraintType(StringRef Constraint) const {
41895 if (Constraint.size() == 1) {
41896 switch (Constraint[0]) {
41908 case 'k': // AVX512 masking registers.
41909 return C_RegisterClass;
41933 else if (Constraint.size() == 2) {
41934 switch (Constraint[0]) {
41935 default:
41936 break;
41937 case 'Y':
41938 switch (Constraint[1]) {
41939 default:
41940 break;
41941 case 'z':
41942 case '0':
41943 return C_Register;
41944 case 'i':
41945 case 'm':
41946 case 'k':
41947 case 't':
41948 case '2':
41949 return C_RegisterClass;
41950 }
41951 }
41952 }
41953 return TargetLowering::getConstraintType(Constraint);
41954 }
41956 /// Examine constraint type and operand type and determine a weight value.
41957 /// This object must already have been set up with the operand type
41958 /// and the current alternative constraint selected.
41959 TargetLowering::ConstraintWeight
41960 X86TargetLowering::getSingleConstraintMatchWeight(
41961 AsmOperandInfo &info, const char *constraint) const {
41962 ConstraintWeight weight = CW_Invalid;
41963 Value *CallOperandVal = info.CallOperandVal;
41964 // If we don't have a value, we can't do a match,
41965 // but allow it at the lowest weight.
41966 if (!CallOperandVal)
41967 return CW_Default;
41968 Type *type = CallOperandVal->getType();
41969 // Look at the constraint type.
41970 switch (*constraint) {
41972 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
41984 if (CallOperandVal->getType()->isIntegerTy())
41985 weight = CW_SpecificReg;
41990 if (type->isFloatingPointTy())
41991 weight = CW_SpecificReg;
41994 if (type->isX86_MMXTy() && Subtarget.hasMMX())
41995 weight = CW_SpecificReg;
41998 unsigned Size = StringRef(constraint).size();
41999 // Pick 'i' as the next char, since 'Yi' and 'Y' are synonymous when matching 'Y'.
42000 char NextChar = Size == 2 ? constraint[1] : 'i';
42003 switch (NextChar) {
42009 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
42010 return CW_SpecificReg;
42012 // Conditional OpMask regs (AVX512)
42014 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
42015 return CW_Register;
42019 if (type->isX86_MMXTy() && Subtarget.hasMMX())
42022 // Any SSE reg when ISA >= SSE2, same as 'Y'
42026 if (!Subtarget.hasSSE2())
42030 // Fall through (handle "Y" constraint).
42034 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
42035 weight = CW_Register;
42038 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
42039 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
42040 weight = CW_Register;
42043 // Enable conditional vector operations using %k<#> registers.
42044 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
42045 weight = CW_Register;
42048 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
42049 if (C->getZExtValue() <= 31)
42050 weight = CW_Constant;
42054 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
42055 if (C->getZExtValue() <= 63)
42056 weight = CW_Constant;
42060 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
42061 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
42062 weight = CW_Constant;
42066 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
42067 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
42068 weight = CW_Constant;
42072 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
42073 if (C->getZExtValue() <= 3)
42074 weight = CW_Constant;
42078 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
42079 if (C->getZExtValue() <= 0xff)
42080 weight = CW_Constant;
42085 if (isa<ConstantFP>(CallOperandVal)) {
42086 weight = CW_Constant;
42090 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
42091 if ((C->getSExtValue() >= -0x80000000LL) &&
42092 (C->getSExtValue() <= 0x7fffffffLL))
42093 weight = CW_Constant;
42097 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
42098 if (C->getZExtValue() <= 0xffffffff)
42099 weight = CW_Constant;
42100 }
42101 break;
42102 }
42103 return weight;
42104 }
42106 /// Try to replace an X constraint, which matches anything, with another that
42107 /// has more specific requirements based on the type of the corresponding
42109 const char *X86TargetLowering::
42110 LowerXConstraint(EVT ConstraintVT) const {
42111 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
42112 // 'f' like normal targets.
42113 if (ConstraintVT.isFloatingPoint()) {
42114 if (Subtarget.hasSSE2())
42115 return "Y";
42116 if (Subtarget.hasSSE1())
42117 return "x";
42118 }
42120 return TargetLowering::LowerXConstraint(ConstraintVT);
42121 }
42123 /// Lower the specified operand into the Ops vector.
42124 /// If it is invalid, don't add anything to Ops.
42125 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
42126 std::string &Constraint,
42127 std::vector<SDValue>&Ops,
42128 SelectionDAG &DAG) const {
42129 SDValue Result;
42131 // Only support length 1 constraints for now.
42132 if (Constraint.length() > 1) return;
42134 char ConstraintLetter = Constraint[0];
42135 switch (ConstraintLetter) {
42136 default: break;
42137 case 'I':
42138 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42139 if (C->getZExtValue() <= 31) {
42140 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
42141 Op.getValueType());
42147 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42148 if (C->getZExtValue() <= 63) {
42149 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
42150 Op.getValueType());
42156 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42157 if (isInt<8>(C->getSExtValue())) {
42158 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
42159 Op.getValueType());
42165 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42166 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
42167 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
42168 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
42169 Op.getValueType());
42175 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42176 if (C->getZExtValue() <= 3) {
42177 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
42178 Op.getValueType());
42184 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42185 if (C->getZExtValue() <= 255) {
42186 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
42187 Op.getValueType());
42193 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42194 if (C->getZExtValue() <= 127) {
42195 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
42196 Op.getValueType());
42202 // 32-bit signed value
42203 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42204 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
42205 C->getSExtValue())) {
42206 // Widen to 64 bits here to get it sign extended.
42207 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
42210 // FIXME gcc accepts some relocatable values here too, but only in certain
42211 // memory models; it's complicated.
42216 // 32-bit unsigned value
42217 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
42218 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
42219 C->getZExtValue())) {
42220 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
42221 Op.getValueType());
42225 // FIXME gcc accepts some relocatable values here too, but only in certain
42226 // memory models; it's complicated.
42230 // Literal immediates are always ok.
42231 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
42232 // Widen to 64 bits here to get it sign extended.
42233 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
42237 // In any sort of PIC mode addresses need to be computed at runtime by
42238 // adding in a register or some sort of table lookup. These can't
42239 // be used as immediates.
42240 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
42241 return;
42243 // If we are in non-pic codegen mode, we allow the address of a global (with
42244 // an optional displacement) to be used with 'i'.
42245 GlobalAddressSDNode *GA = nullptr;
42246 int64_t Offset = 0;
42248 // Match either (GA), (GA+C), (GA+C1+C2), etc.
42249 while (1) {
42250 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
42251 Offset += GA->getOffset();
42252 break;
42253 } else if (Op.getOpcode() == ISD::ADD) {
42254 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
42255 Offset += C->getZExtValue();
42256 Op = Op.getOperand(0);
42257 continue;
42258 }
42259 } else if (Op.getOpcode() == ISD::SUB) {
42260 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
42261 Offset += -C->getZExtValue();
42262 Op = Op.getOperand(0);
42263 continue;
42264 }
42265 }
42267 // Otherwise, this isn't something we can handle, reject it.
42268 return;
42269 }
42271 const GlobalValue *GV = GA->getGlobal();
42272 // If we require an extra load to get this address, as in PIC mode, we
42273 // can't accept it.
42274 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
42275 return;
42277 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
42278 GA->getValueType(0), Offset);
42279 break;
42280 }
42281 }
42283 if (Result.getNode()) {
42284 Ops.push_back(Result);
42285 return;
42286 }
42287 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
42288 }
42290 /// Check if \p RC is a general purpose register class.
42291 /// I.e., GR* or one of their variant.
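// e.g. GR32_ABCD reports GR32 as a superclass, so it is treated as a GR class here.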
42292 static bool isGRClass(const TargetRegisterClass &RC) {
42293 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
42294 RC.hasSuperClassEq(&X86::GR16RegClass) ||
42295 RC.hasSuperClassEq(&X86::GR32RegClass) ||
42296 RC.hasSuperClassEq(&X86::GR64RegClass) ||
42297 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
42298 }
42300 /// Check if \p RC is a vector register class.
42301 /// I.e., FR* / VR* or one of their variant.
42302 static bool isFRClass(const TargetRegisterClass &RC) {
42303 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
42304 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
42305 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
42306 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
42307 RC.hasSuperClassEq(&X86::VR512RegClass);
42308 }
42310 std::pair<unsigned, const TargetRegisterClass *>
42311 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
42312 StringRef Constraint,
42314 // First, see if this is a constraint that directly corresponds to an LLVM
42315 // register class.
42316 if (Constraint.size() == 1) {
42317 // GCC Constraint Letters
42318 switch (Constraint[0]) {
42319 default: break;
42320 // TODO: Slight differences here in allocation order and leaving
42321 // RIP in the class. Do they matter any more here than they do
42322 // in the normal allocation?
42323 case 'k':
42324 if (Subtarget.hasAVX512()) {
42325 // Only supported in AVX512 or later.
42326 switch (VT.SimpleTy) {
42327 default: break;
42328 case MVT::i32:
42329 return std::make_pair(0U, &X86::VK32RegClass);
42330 case MVT::i16:
42331 return std::make_pair(0U, &X86::VK16RegClass);
42332 case MVT::i8:
42333 return std::make_pair(0U, &X86::VK8RegClass);
42334 case MVT::i1:
42335 return std::make_pair(0U, &X86::VK1RegClass);
42336 case MVT::i64:
42337 return std::make_pair(0U, &X86::VK64RegClass);
42338 }
42339 }
42340 break;
42341 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
42342 if (Subtarget.is64Bit()) {
42343 if (VT == MVT::i32 || VT == MVT::f32)
42344 return std::make_pair(0U, &X86::GR32RegClass);
42345 if (VT == MVT::i16)
42346 return std::make_pair(0U, &X86::GR16RegClass);
42347 if (VT == MVT::i8 || VT == MVT::i1)
42348 return std::make_pair(0U, &X86::GR8RegClass);
42349 if (VT == MVT::i64 || VT == MVT::f64)
42350 return std::make_pair(0U, &X86::GR64RegClass);
42351 break;
42352 }
42353 LLVM_FALLTHROUGH;
42354 // 32-bit fallthrough
42355 case 'Q': // Q_REGS
42356 if (VT == MVT::i32 || VT == MVT::f32)
42357 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
42358 if (VT == MVT::i16)
42359 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
42360 if (VT == MVT::i8 || VT == MVT::i1)
42361 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
42362 if (VT == MVT::i64)
42363 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
42364 break;
42365 case 'r': // GENERAL_REGS
42366 case 'l': // INDEX_REGS
42367 if (VT == MVT::i8 || VT == MVT::i1)
42368 return std::make_pair(0U, &X86::GR8RegClass);
42369 if (VT == MVT::i16)
42370 return std::make_pair(0U, &X86::GR16RegClass);
42371 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
42372 return std::make_pair(0U, &X86::GR32RegClass);
42373 return std::make_pair(0U, &X86::GR64RegClass);
42374 case 'R': // LEGACY_REGS
42375 if (VT == MVT::i8 || VT == MVT::i1)
42376 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
42377 if (VT == MVT::i16)
42378 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
42379 if (VT == MVT::i32 || !Subtarget.is64Bit())
42380 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
42381 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
42382 case 'f': // FP Stack registers.
42383 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
42384 // value to the correct fpstack register class.
42385 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
42386 return std::make_pair(0U, &X86::RFP32RegClass);
42387 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
42388 return std::make_pair(0U, &X86::RFP64RegClass);
42389 return std::make_pair(0U, &X86::RFP80RegClass);
42390 case 'y': // MMX_REGS if MMX allowed.
42391 if (!Subtarget.hasMMX()) break;
42392 return std::make_pair(0U, &X86::VR64RegClass);
42393 case 'Y': // SSE_REGS if SSE2 allowed
42394 if (!Subtarget.hasSSE2()) break;
42395 LLVM_FALLTHROUGH;
42396 case 'v':
42397 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
42398 if (!Subtarget.hasSSE1()) break;
42399 bool VConstraint = (Constraint[0] == 'v');
42401 switch (VT.SimpleTy) {
42402 default: break;
42403 // Scalar SSE types.
42404 case MVT::f32:
42405 case MVT::i32:
42406 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
42407 return std::make_pair(0U, &X86::FR32XRegClass);
42408 return std::make_pair(0U, &X86::FR32RegClass);
42409 case MVT::f64:
42410 case MVT::i64:
42411 if (VConstraint && Subtarget.hasVLX())
42412 return std::make_pair(0U, &X86::FR64XRegClass);
42413 return std::make_pair(0U, &X86::FR64RegClass);
42414 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
42415 // Vector types.
42416 case MVT::v16i8:
42417 case MVT::v8i16:
42418 case MVT::v4i32:
42419 case MVT::v2i64:
42420 case MVT::v4f32:
42421 case MVT::v2f64:
42422 if (VConstraint && Subtarget.hasVLX())
42423 return std::make_pair(0U, &X86::VR128XRegClass);
42424 return std::make_pair(0U, &X86::VR128RegClass);
42425 // AVX types.
42426 case MVT::v32i8:
42427 case MVT::v16i16:
42428 case MVT::v8i32:
42429 case MVT::v4i64:
42430 case MVT::v8f32:
42431 case MVT::v4f64:
42432 if (VConstraint && Subtarget.hasVLX())
42433 return std::make_pair(0U, &X86::VR256XRegClass);
42434 return std::make_pair(0U, &X86::VR256RegClass);
42435 case MVT::v8f64:
42436 case MVT::v16f32:
42437 case MVT::v16i32:
42438 case MVT::v8i64:
42439 return std::make_pair(0U, &X86::VR512RegClass);
42440 }
42441 break;
42442 }
42443 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
42444 switch (Constraint[1]) {
42445 default:
42446 break;
42447 case 'i':
42448 case 't':
42449 case '2':
42450 return getRegForInlineAsmConstraint(TRI, "Y", VT);
42452 if (!Subtarget.hasMMX()) break;
42453 return std::make_pair(0U, &X86::VR64RegClass);
42454 case 'z':
42455 case '0':
42456 if (!Subtarget.hasSSE1()) break;
42457 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
42458 case 'k':
42459 // This register class doesn't allocate k0 for masked vector operation.
42460 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
42461 switch (VT.SimpleTy) {
42462 default: break;
42463 case MVT::i32:
42464 return std::make_pair(0U, &X86::VK32WMRegClass);
42465 case MVT::i16:
42466 return std::make_pair(0U, &X86::VK16WMRegClass);
42467 case MVT::i8:
42468 return std::make_pair(0U, &X86::VK8WMRegClass);
42469 case MVT::i1:
42470 return std::make_pair(0U, &X86::VK1WMRegClass);
42471 case MVT::i64:
42472 return std::make_pair(0U, &X86::VK64WMRegClass);
42473 }
42474 }
42475 break;
42476 }
42477 }
42479 // Use the default implementation in TargetLowering to convert the register
42480 // constraint into a member of a register class.
42481 std::pair<unsigned, const TargetRegisterClass*> Res;
42482 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
42484 // Not found as a standard register?
42485 if (!Res.second) {
42486 // Map st(0) -> st(7) -> ST0
42487 if (Constraint.size() == 7 && Constraint[0] == '{' &&
42488 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
42489 Constraint[3] == '(' &&
42490 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
42491 Constraint[5] == ')' && Constraint[6] == '}') {
42492 // st(7) is not allocatable and thus not a member of RFP80. Return
42493 // singleton class in cases where we have a reference to it.
42494 if (Constraint[4] == '7')
42495 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
42496 return std::make_pair(X86::FP0 + Constraint[4] - '0',
42497 &X86::RFP80RegClass);
42498 }
42500 // GCC allows "st(0)" to be called just plain "st".
42501 if (StringRef("{st}").equals_lower(Constraint))
42502 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
42505 if (StringRef("{flags}").equals_lower(Constraint))
42506 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
42508 // 'A' means [ER]AX + [ER]DX.
42509 if (Constraint == "A") {
42510 if (Subtarget.is64Bit())
42511 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
42512 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
42513 "Expecting 64, 32 or 16 bit subtarget");
42514 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
42515 }
42516 return Res;
42517 }
42519 // Make sure it isn't a register that requires 64-bit mode.
42520 if (!Subtarget.is64Bit() &&
42521 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
42522 TRI->getEncodingValue(Res.first) >= 8) {
42523 // Register requires REX prefix, but we're in 32-bit mode.
42524 return std::make_pair(0, nullptr);
42525 }
42527 // Make sure it isn't a register that requires AVX512.
42528 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
42529 TRI->getEncodingValue(Res.first) & 0x10) {
42530 // Register requires EVEX prefix.
42531 return std::make_pair(0, nullptr);
42532 }
42534 // Otherwise, check to see if this is a register class of the wrong value
42535 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
42536 // turn into {ax},{dx}.
42537 // MVT::Other is used to specify clobber names.
42538 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
42539 return Res; // Correct type already, nothing to do.
42541 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
42542 // return "eax". This should even work for things like getting 64bit integer
42543 // registers when given an f64 type.
42544 const TargetRegisterClass *Class = Res.second;
42545 // The generic code will match the first register class that contains the
42546 // given register. Thus, based on the ordering of the tablegened file,
42547 // the "plain" GR classes might not come first.
42548 // Therefore, use a helper method.
42549 if (isGRClass(*Class)) {
42550 unsigned Size = VT.getSizeInBits();
42551 if (Size == 1) Size = 8;
42552 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
42553 if (DestReg > 0) {
42554 bool is64Bit = Subtarget.is64Bit();
42555 const TargetRegisterClass *RC =
42556 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
42557 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
42558 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
42559 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
42560 : nullptr;
42561 if (Size == 64 && !is64Bit) {
42562 // Model GCC's behavior here and select a fixed pair of 32-bit
42563 // registers.
42564 switch (Res.first) {
42565 case X86::EAX:
42566 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
42567 case X86::EDX:
42568 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
42569 case X86::ECX:
42570 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
42571 case X86::EBX:
42572 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
42573 case X86::ESI:
42574 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
42575 case X86::EDI:
42576 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
42577 case X86::EBP:
42578 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
42579 default:
42580 return std::make_pair(0, nullptr);
42581 }
42582 }
42583 if (RC && RC->contains(DestReg))
42584 return std::make_pair(DestReg, RC);
42585 }
42587 // No register found/type mismatch.
42588 return std::make_pair(0, nullptr);
42589 } else if (isFRClass(*Class)) {
42590 // Handle references to XMM physical registers that got mapped into the
42591 // wrong class. This can happen with constraints like {xmm0} where the
42592 // target independent register mapper will just pick the first match it can
42593 // find, ignoring the required type.
42595 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
42596 if (VT == MVT::f32 || VT == MVT::i32)
42597 Res.second = &X86::FR32RegClass;
42598 else if (VT == MVT::f64 || VT == MVT::i64)
42599 Res.second = &X86::FR64RegClass;
42600 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
42601 Res.second = &X86::VR128RegClass;
42602 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
42603 Res.second = &X86::VR256RegClass;
42604 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
42605 Res.second = &X86::VR512RegClass;
42606 else {
42607 // Type mismatch and not a clobber: return an error.
42608 Res.first = 0;
42609 Res.second = nullptr;
42610 }
42611 }
42613 return Res;
42614 }
42616 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
42617 const AddrMode &AM, Type *Ty,
42618 unsigned AS) const {
42619 // Scaling factors are not free at all.
42620 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
42621 // will take 2 allocations in the out of order engine instead of 1
42622 // for plain addressing mode, i.e. inst (reg1).
42624 // vaddps (%rsi,%rdx), %ymm0, %ymm1
42625 // Requires two allocations (one for the load, one for the computation)
42627 // vaddps (%rsi), %ymm0, %ymm1
42628 // Requires just 1 allocation, i.e., freeing allocations for other operations
42629 // and having less micro operations to execute.
42631 // For some X86 architectures, this is even worse because for instance for
42632 // stores, the complex addressing mode forces the instruction to use the
42633 // "load" ports instead of the dedicated "store" port.
42634 // E.g., on Haswell:
42635 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
42636 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
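// By convention a non-negative return value is the extra cost of the scaling factor, while a negative value (the fallback below) signals that the addressing mode is not supported at all.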
42637 if (isLegalAddressingMode(DL, AM, Ty, AS))
42638 // Scale represents reg2 * scale, thus account for 1
42639 // as soon as we use a second register.
42640 return AM.Scale != 0;
42641 return -1;
42642 }
42644 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
42645 // Integer division on x86 is expensive. However, when aggressively optimizing
42646 // for code size, we prefer to use a div instruction, as it is usually smaller
42647 // than the alternative sequence.
42648 // The exception to this is vector division. Since x86 doesn't have vector
42649 // integer division, leaving the division as-is is a loss even in terms of
42650 // size, because it will have to be scalarized, while the alternative code
42651 // sequence can be performed in vector form.
42652 bool OptSize =
42653 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
42654 return OptSize && !VT.isVector();
42655 }
42657 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
42658 if (!Subtarget.is64Bit())
42659 return;
42661 // Update IsSplitCSR in X86MachineFunctionInfo.
42662 X86MachineFunctionInfo *AFI =
42663 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
42664 AFI->setIsSplitCSR(true);
42665 }
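// For functions using the split CSR convention (e.g. the CXX_FAST_TLS calling convention), copy each callee-saved register into a virtual register in the entry block and copy it back before every exit block terminator.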
42667 void X86TargetLowering::insertCopiesSplitCSR(
42668 MachineBasicBlock *Entry,
42669 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
42670 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
42671 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
42672 if (!IStart)
42673 return;
42675 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
42676 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
42677 MachineBasicBlock::iterator MBBI = Entry->begin();
42678 for (const MCPhysReg *I = IStart; *I; ++I) {
42679 const TargetRegisterClass *RC = nullptr;
42680 if (X86::GR64RegClass.contains(*I))
42681 RC = &X86::GR64RegClass;
42682 else
42683 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
42685 unsigned NewVR = MRI->createVirtualRegister(RC);
42686 // Create copy from CSR to a virtual register.
42687 // FIXME: this currently does not emit CFI pseudo-instructions, it works
42688 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
42689 // nounwind. If we want to generalize this later, we may need to emit
42690 // CFI pseudo-instructions.
42691 assert(Entry->getParent()->getFunction().hasFnAttribute(
42692 Attribute::NoUnwind) &&
42693 "Function should be nounwind in insertCopiesSplitCSR!");
42694 Entry->addLiveIn(*I);
42695 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
42696 .addReg(*I);
42698 // Insert the copy-back instructions right before the terminator.
42699 for (auto *Exit : Exits)
42700 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
42701 TII->get(TargetOpcode::COPY), *I)
42702 .addReg(NewVR);
42703 }
42704 }
42706 bool X86TargetLowering::supportSwiftError() const {
42707 return Subtarget.is64Bit();
42708 }
42710 /// Returns the name of the symbol used to emit stack probes or the empty
42711 /// string if not applicable.
42712 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
42713 // If the function specifically requests stack probes, emit them.
42714 if (MF.getFunction().hasFnAttribute("probe-stack"))
42715 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
42717 // Generally, if we aren't on Windows, the platform ABI does not include
42718 // support for stack probes, so don't emit them.
42719 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
42720 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
42723 // We need a stack probe to conform to the Windows ABI. Choose the right
42725 if (Subtarget.is64Bit())
42726 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
42727 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";