//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);
static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);
static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             StringRef Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
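  // (Note: addBypassSlowDiv(32, 8) has the IR-level slow-division bypass
  // guard each 32-bit divide with a runtime check and use the much shorter
  // 8-bit divide when both operands happen to fit in 8 bits.)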
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }
  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
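  // (After a UCOMIS*, an unordered result sets ZF, PF and CF, so SETOEQ has
  // to check ZF=1 *and* PF=0, and SETUNE the complement; a single flag test
  // is not enough, hence the Expand above.)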
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
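  // (e.g. a u8/u16 input is zero-extended to i32 and converted with the
  // signed instruction; that is exact because the extended value is always
  // non-negative and in range.)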
  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
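  // (e.g. f32 -> u16 is computed as a signed f32 -> i32 conversion followed
  // by a truncate; every in-range u16 result fits in i32, so nothing is
  // lost.)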
  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.

  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
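  // (For example, i32 'a / b' and 'a % b' both expand to ISD::SDIVREM(a, b),
  // so after CSE a single IDIV yields both the quotient and the remainder.)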
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }
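  // (Without POPCNT, the legalizer expands CTPOP inline into the usual
  // bit-parallel shift/mask/add counting sequence rather than a libcall.)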
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }
  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
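    // (Every other FP immediate is materialized with a load; only +0.0,
    // which is a cheap xorps/xorpd, is special-cased here.)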
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }
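    // (The x87 constants registered here are +0.0 and +1.0, loaded with
    // FLDZ/FLD1; the negative variants are produced by a follow-up FCHS.)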
    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f80, Expand);
    setOperationAction(ISD::FCOS, MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
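    // (e.g. a sextload of v4i8 can be a single 32-bit scalar load followed
    // by an in-register sign extension to the legal 128-bit type.)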
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }
    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }
    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }
    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
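    // (The promoted node converts through v8i32, roughly a vcvttps2dq, and
    // then truncates the result back down to v8i16.)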
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }
    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }
    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }
    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }
    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    if (HasInt256) {
      // Custom legalize 2x32 to get a little better code.
      setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
      setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::MGATHER, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    // Extends of v16i1/v8i1 to 128-bit vectors.
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);

    for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1,
                     MVT::v16i1, MVT::v32i1, MVT::v64i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }
    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }
    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (!Subtarget.hasVLX()) {
      // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
      // to 512-bit rather than use the AVX2 instructions so that we can use
      // k-masks.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);
    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
      setOperationAction(ISD::ABS, VT, Legal);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::ROTL, VT, Custom);
      setOperationAction(ISD::ROTR, VT, Custom);
    }

    // Need to promote to 64-bit even though we have 32-bit masked instructions
    // because the IR optimizers rearrange bitcasts around logic ops leaving
    // too many variations to handle if we don't promote them.
    setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
    setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
    setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);

      setOperationAction(ISD::MUL, MVT::v8i64, Legal);
    }
    if (Subtarget.hasCDI()) {
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
      for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
        setOperationAction(ISD::CTLZ, VT, Legal);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
      }
    } // Subtarget.hasCDI()
    if (Subtarget.hasVPOPCNTDQ()) {
      for (auto VT : { MVT::v16i32, MVT::v8i64 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
    // Extract subvector is special because the value type
    // (result) is 256-bit but the source is 512-bit wide.
    // 128-bit was made Legal under AVX1.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                     MVT::v8f32, MVT::v4f64 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }
    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
    }
  }

  if (!Subtarget.useSoftFloat() &&
      (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
    // These operations are handled on non-VLX by artificially widening in
    // isel patterns.
    // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);

    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
      setOperationAction(ISD::ABS, VT, Legal);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
      setOperationAction(ISD::ROTL, VT, Custom);
      setOperationAction(ISD::ROTR, VT, Custom);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
      setOperationAction(ISD::MSCATTER, VT, Custom);

    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);

        setOperationAction(ISD::MUL, VT, Legal);
      }
    }

    if (Subtarget.hasCDI()) {
      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
        setOperationAction(ISD::CTLZ, VT, Legal);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
      }
    } // Subtarget.hasCDI()

    if (Subtarget.hasVPOPCNTDQ()) {
      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
    addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);

    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);

    for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);

      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);

    // Extends from v32i1 masks to 256-bit vectors.
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
    // Extends from v64i1 masks to 512-bit vectors.
    setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
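
    // A mask extend corresponds to IR such as (illustrative only):
    //   %m = icmp eq <64 x i8> %a, %b      ; v64i1 mask in a k-register
    //   %x = sext <64 x i1> %m to <64 x i8>
    // With BWI the sext can lower to a single VPMOVM2B; zext additionally
    // masks the result down to 0/1 per lane.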

    setOperationAction(ISD::MUL, MVT::v32i16, Legal);
    setOperationAction(ISD::MUL, MVT::v64i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
    setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

    setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::ABS, VT, Legal);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);

      setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::OR,  VT, MVT::v8i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
    }

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
    }

    if (Subtarget.hasBITALG()) {
      for (auto VT : { MVT::v64i8, MVT::v32i16 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
      (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
      setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
      setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
    }

    // These operations are handled on non-VLX by artificially widening in
    // isel patterns.
    // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

    if (Subtarget.hasBITALG()) {
      for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

    for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);

      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    }

    // TODO: v8i1 concat should be legal without VLX to support concats of
    // v1i1, but we won't legalize it correctly currently without introducing
    // a v4i1 concat in the middle.
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);

    // Extends from v2i1/v4i1 masks to 128-bit vectors.
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);

    setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
    setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
    setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
    setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

    setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
    setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
    setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);

    if (Subtarget.hasDQI()) {
      // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
      // v2f32 UINT_TO_FP is already custom under SSE2.
      setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
             "Unexpected operation action!");
      // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
      setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
    }
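
    // For example (illustrative IR), with DQI a conversion such as
    //   %f = sitofp <2 x i64> %v to <2 x float>
    // can be lowered through VCVTQQ2PS on a widened vector instead of
    // going element-by-element through scalar conversions.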

    if (Subtarget.hasBWI()) {
      setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
      setTruncStoreAction(MVT::v8i16,  MVT::v8i8,  Legal);
    }
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  if (!Subtarget.is64Bit()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  }

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub/Mul with overflow operations are custom lowered.
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);

    // Support carry in as value rather than glue.
    setOperationAction(ISD::ADDCARRY, VT, Custom);
    setOperationAction(ISD::SUBCARRY, VT, Custom);
    setOperationAction(ISD::SETCCCARRY, VT, Custom);
  }
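
  // As a sketch, an overflow intrinsic such as
  //   %r = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  // is custom-lowered to an X86 add that also produces EFLAGS, with the i1
  // overflow result read back via SETCC; ADDCARRY/SUBCARRY let a chained
  // wide addition consume that carry as a value instead of DAG glue.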

  if (!Subtarget.is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
    setLibcallName(RTLIB::MUL_I128, nullptr);
  }

  // Combine sin / cos into one node or libcall if possible.
  if (Subtarget.hasSinCos()) {
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
    if (Subtarget.isTargetDarwin()) {
      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }
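
  // Illustrative before/after for the Darwin case: a pair of calls
  //   %s = call double @llvm.sin.f64(double %x)
  //   %c = call double @llvm.cos.f64(double %x)
  // can be combined into one FSINCOS node and emitted as a single
  // __sincos_stret call that returns both results in registers, rather
  // than a sincos() call writing through two out-pointers in memory.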

  if (Subtarget.isTargetWin64()) {
    setOperationAction(ISD::SDIV, MVT::i128, Custom);
    setOperationAction(ISD::UDIV, MVT::i128, Custom);
    setOperationAction(ISD::SREM, MVT::i128, Custom);
    setOperationAction(ISD::UREM, MVT::i128, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
  }

  // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
  // is. We should promote the value to 64-bits to solve this.
  // This is what the CRT headers do - `fmodf` is an inline header
  // function casting to f64 and calling `fmod`.
  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
                              Subtarget.isTargetWindowsItanium()))
    for (ISD::NodeType Op :
         {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
          ISD::FLOG10, ISD::FPOW, ISD::FSIN})
      if (isOperationExpand(Op, MVT::f32))
        setOperationAction(Op, MVT::f32, Promote);
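
  // Sketch of the resulting expansion (illustrative):
  //   float r = fmodf(a, b);
  // effectively becomes
  //   float r = (float)fmod((double)a, (double)b);
  // matching the inline definition in the MSVC CRT headers.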

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
  setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FNEG);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::MLOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MSTORE);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::MSCATTER);
  setTargetDAGCombine(ISD::MGATHER);

  computeRegisterProperties(Subtarget.getRegisterInfo());

  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 4;

  // TODO: These control memcmp expansion in CGP and could be raised higher,
  // but that needs to be benchmarked and balanced with the potential use of
  // vector load/store types (PR33329, PR33914).
  MaxLoadsPerMemcmp = 2;
  MaxLoadsPerMemcmpOptSize = 2;

  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
  setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

  // An out-of-order CPU can speculatively execute past a predictable branch,
  // but a conditional move could be stalled by an expensive earlier operation.
  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
  EnableExtLdPromotion = true;
  setPrefFunctionAlignment(4); // 2^4 bytes.

  verifyIntrinsicTables();
}

// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}

bool X86TargetLowering::useStackGuardXorFP() const {
  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
  return Subtarget.getTargetTriple().isOSMSVCRT();
}

SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                               const SDLoc &DL) const {
  EVT PtrTy = getPointerTy(DAG.getDataLayout());
  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
  return SDValue(Node, 0);
}
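
// For reference, the XOR*_FP pseudo expands to a single instruction of the
// form (illustrative 64-bit asm):
//   xorq %rbp, %rax   ; guard value ^= frame pointer
// tying the stack guard value to the current frame, as the MSVC CRT's
// security cookie scheme expects.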

TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
  if (ExperimentalVectorWideningLegalization &&
      VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType().getSimpleVT() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    const unsigned NumElts = VT.getVectorNumElements();

    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, NumElts);

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, NumElts);
    }
  }

  return VT.changeVectorElementTypeToInteger();
}
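
// Example (illustrative): on an AVX512VL target a compare such as
//   %c = icmp slt <8 x i32> %a, %b
// gets setcc result type v8i1 (a k-register), while the same compare on a
// plain AVX2 target falls through to v8i32, i.e. a full-width vector of
// 0/-1 lanes.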

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against the alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
/// not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Size >= 32 && Subtarget.hasAVX()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2())
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      if (Subtarget.hasSSE1())
        return MVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}
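
// Example (illustrative): a 32-byte memset on an AVX target with fast
// unaligned accesses returns MVT::v32i8, so
//   memset(p, 0, 32);
// becomes a single 32-byte vector store instead of four i64 stores.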

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                  unsigned,
                                                  unsigned,
                                                  bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {
  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
    Type *T = Args[Idx].Ty;
    if (T->isPointerTy() || T->isIntegerTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
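
// For example (illustrative IR), SegmentOffset(IRB, 0x28, 257) produces the
// constant expression
//   inttoptr (i32 40 to i8* addrspace(257)*)
// where address space 257 denotes an %fs-relative access on x86-64, so a
// load through it reads %fs:0x28.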

Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    if (Subtarget.isTargetFuchsia()) {
      // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    } else {
      // %fs:0x28, unless we're using a Kernel code model, in which case
      // it's %gs:0x28. gs:0x14 on i386.
      unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
      return SegmentOffset(IRB, Offset, getAddressSpace());
    }
  }

  return TargetLowering::getIRStackGuard(IRB);
}

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    auto *SecurityCheckCookie = cast<Function>(
        M.getOrInsertFunction("__security_check_cookie",
                              Type::getVoidTy(M.getContext()),
                              Type::getInt8PtrTy(M.getContext())));
    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    return;
  }
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case
    // it's %gs:0x48. %gs:0x24 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");

  return SrcAS < 256 && DestAS < 256;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

/// Lowers masks values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16  -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  } else
    return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
}

/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(0, Dl, MVT::i32));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(1, Dl, MVT::i32));

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
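
// Sketch of the effect on a 32-bit target (register names illustrative):
// a v64i1 mask is bitcast to i64 and split so that, e.g., the low 32 bits
// travel in EAX and the high 32 bits in EDX, matching how any other i64
// value is passed in a 32-bit register pair.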

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();
2237 EVT ValVT = ValToCopy.getValueType();
2239 // Promote values to the appropriate types.
2240 if (VA.getLocInfo() == CCValAssign::SExt)
2241 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2242 else if (VA.getLocInfo() == CCValAssign::ZExt)
2243 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2244 else if (VA.getLocInfo() == CCValAssign::AExt) {
2245 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2246 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2248 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2250 else if (VA.getLocInfo() == CCValAssign::BCvt)
2251 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2253 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2254 "Unexpected FP-extend for return value.");
2256 // If this is x86-64, and we disabled SSE, we can't return FP values,
2257 // or SSE or MMX vectors.
2258 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2259 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2260 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2261 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2262 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2263 } else if (ValVT == MVT::f64 &&
2264 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2265 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2266 // llvm-gcc has never done it right and no one has noticed, so this
2267 // should be OK for now.
2268 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2269 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2272 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2273 // the RET instruction and handled by the FP Stackifier.
2274 if (VA.getLocReg() == X86::FP0 ||
2275 VA.getLocReg() == X86::FP1) {
2276 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2277 // change the value to the FP stack register class.
2278 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2279 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2280 RetOps.push_back(ValToCopy);
2281 // Don't emit a copytoreg.

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Pass64BitArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
    //
    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
    //
    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B
    //
    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call see PR19530
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}

/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
///                        glue purposes. In the case the DAG is already using
///                        physical register instead of virtual, we should glue
///                        our new SDValue to InFlag SDvalue.
/// \return a new SDvalue of size 64bit.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  unsigned Reg;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the i32 value to v32i1.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the i32 value to v32i1.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}

/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On a 32 bit machine this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On a 64 bit machine there is no need to truncate; the value is only
    // bitcast below.
  } else {
    MVT maskLen;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      maskLen = MVT::i8;
      break;
    case MVT::v16i1:
      maskLen = MVT::i16;
      break;
    case MVT::v32i1:
      maskLen = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}
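
// Usage sketch (illustrative): a v16i1 mask returned in EAX arrives here
// with ValVT = v16i1 and ValLoc = i32; the value is truncated i32 -> i16
// and then bitcast to v16i1, the inverse of what lowerMasksToReg did on the
// return path.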

/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
      if (VA.getValVT().isVector() &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    InVals.push_back(Val);
  }

  return Chain;
}

//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and others. It differs from the C calling convention just a
//  little: the callee should clean up the stack, not the caller. Symbols
//  should be also decorated in some fancy way :) It doesn't support any
//  vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};

static StructReturnType
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
  if (Outs.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       /*isTailCall*/false,
                       MachinePointerInfo(), MachinePointerInfo());
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::HHVM);
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;

  ImmutableCallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // type.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
  // taken by a return address.
  int Offset = 0;
  if (CallConv == CallingConv::X86_INTR) {
    // X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in a regular call.
    // The offset of the last argument needs to be set to -4/-8 bytes, while
    // the offset of the first argument out of two should be set to 0 bytes.
    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
    if (Subtarget.is64Bit() && Ins.size() == 2) {
      // The stack pointer needs to be realigned for 64 bit handlers with error
      // code, so the argument offset changes by 8 bytes.
      Offset += 8;
    }
  }

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis. In case of tail call optimization, mark all
  // arguments mutable, since they could be overwritten by the lowering of
  // arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    // Adjust SP offset of interrupt parameter.
    if (CallConv == CallingConv::X86_INTR) {
      MFI.setObjectOffset(FI, Offset);
    }
    return DAG.getFrameIndex(FI, PtrVT);
  }

  // This is an argument in memory. We might be able to perform copy elision.
  if (Flags.isCopyElisionCandidate()) {
    EVT ArgVT = Ins[i].ArgVT;
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
                                     /*Immutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    } else {
      // This is not the first piece of an argument in memory. See if there is
      // already a fixed stack object including this offset. If so, assume it
      // was created by the PartOffset == 0 branch above and create a load from
      // the appropriate offset into it.
      int64_t PartBegin = VA.getLocMemOffset();
      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
      int FI = MFI.getObjectIndexBegin();
      for (; MFI.isFixedObjectIndex(FI); ++FI) {
        int64_t ObjBegin = MFI.getObjectOffset(FI);
        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
          break;
      }
      if (MFI.isFixedObjectIndex(FI)) {
        SDValue Addr =
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
    }
  }
  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(FI, true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }

  // Adjust SP offset of interrupt parameter.
  if (CallConv == CallingConv::X86_INTR) {
    MFI.setObjectOffset(FI, Offset);
  }

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
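  // (Illustrative: a v1i1 value that was promoted to an i8 stack slot is
  // reloaded as i8 here; the ExtendedInMem path below then rebuilds the
  // vector form with SCALAR_TO_VECTOR, or recovers a scalar i1 with a
  // TRUNCATE of the wider loaded value.)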
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  if (Subtarget.isCallingConvWin64(CallConv)) {
    static const MCPhysReg GPR64ArgRegsWin64[] = {
      X86::RCX, X86::RDX, X86::R8, X86::R9
    };
    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
                        std::end(GPR64ArgRegsWin64));
  }

  static const MCPhysReg GPR64ArgRegs64Bit[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
                      std::end(GPR64ArgRegs64Bit));
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
                                                CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());
  if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain var arg parameters are shadowed
    // in their paired GPR. So we only need to save the GPR to their home
    // slots.
    // TODO: __vectorcall will change this.
    return None;
  }

  const Function &F = MF.getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool isSoftFloat = Subtarget.useSoftFloat();
  assert(!(isSoftFloat && NoImplicitFloatOps) &&
         "SSE register cannot be used when SSE is disabled!");
  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
    // registers.
    return None;

  static const MCPhysReg XMMArgRegs64Bit[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
  };
  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
#ifndef NDEBUG
static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
                          return A.getValNo() < B.getValNo();
                        });
}
#endif
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

  const Function &F = MF.getFunction();
  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
      F.getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

  assert(
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");

  if (CallConv == CallingConv::X86_INTR) {
    bool isLegal = Ins.size() == 1 ||
                   (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
                                        (!Is64Bit && Ins[1].VT == MVT::i32)));
    if (!isLegal)
      report_fatal_error("X86 interrupts may take one or two arguments");
  }
  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);
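  // (Per the Win64 ABI, the caller always reserves 32 bytes of "home" space:
  // four 8-byte slots shadowing RCX, RDX, R8 and R9. Reserving it here means
  // the first stack-passed argument is assigned offset 32, matching where the
  // callee expects it.)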
  CCInfo.AnalyzeArguments(Ins, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // arguments.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  }

  // The next loop assumes that the locations are in the same order as the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");
  SDValue ArgValue;
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
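        // (Illustrative: a v64i1 mask therefore occupies two consecutive
        // 32-bit GPR locations on such targets, and getv64i1Argument below
        // consumes both ArgLocs entries to reassemble the 64-bit mask.)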
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i32)
          RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
          RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f32)
          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
        else if (RegVT == MVT::f80)
          RC = &X86::RFP80RegClass;
        else if (RegVT == MVT::f128)
          RC = &X86::FR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
        else if (RegVT.is128BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
        else if (RegVT == MVT::v1i1)
          RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
        else if (RegVT == MVT::v32i1)
          RC = &X86::VK32RegClass;
        else if (RegVT == MVT::v64i1)
          RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");

        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }
      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        else if (VA.getValVT().isVector() &&
                 VA.getValVT().getScalarType() == MVT::i1 &&
                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

    InVals.push_back(ArgValue);
  }
  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    // Swift calling convention does not require we copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift)
      continue;

    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
    if (Ins[I].Flags.isSRet()) {
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (Reg == 0) {
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      break;
    }
  }
  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CallConv,
                         MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  if (MFI.hasVAStart() &&
      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                   CallConv != CallingConv::X86_ThisCall))) {
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
  }

  // Figure out if XMM registers are in use.
  assert(!(Subtarget.useSoftFloat() &&
           F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
         "SSE register cannot be used when SSE is disabled!");
  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    // Gather all the live in physical registers.
    SmallVector<SDValue, 6> LiveGPRs;
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal;
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(
          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
    }
    if (!ArgXMMs.empty()) {
      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
        LiveXMMRegs.push_back(
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
      }
    }
    if (IsWin64) {
      // Get to the caller-allocated home save location. Add 8 to account
      // for the return address.
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
    }
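    // (Register save area layout under the SysV x86-64 ABI, for reference:
    // six 8-byte GPR slots at bytes 0-47 followed by eight 16-byte XMM slots
    // at bytes 48-175, 176 bytes in total. The gp_offset/fp_offset values
    // seeded above are the va_list cursors into this area.)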
    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                      getPointerTy(DAG.getDataLayout()));
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(),
                           FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }
    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
      // Now store the XMM (fp + vector) parameter registers.
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Chain);
      SaveXMMOps.push_back(ALVal);
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
                             FuncInfo->getRegSaveFrameIndex(), dl));
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
                             FuncInfo->getVarArgsFPOffset(), dl));
      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
                        LiveXMMRegs.end());
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                   MVT::Other, SaveXMMOps));
    }

    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  }
  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
    // Find the largest legal vector type.
    MVT VecVT = MVT::Other;
    // FIXME: Only some x86_32 calling conventions support AVX512.
    if (Subtarget.hasAVX512() &&
        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
                     CallConv == CallingConv::Intel_OCL_BI)))
      VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;

    // We forward some GPRs and some vector types.
    SmallVector<MVT, 2> RegParmTypes;
    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
    RegParmTypes.push_back(IntVT);
    if (VecVT != MVT::Other)
      RegParmTypes.push_back(VecVT);

    // Compute the set of forwarded registers. The rest are scratch.
    SmallVectorImpl<ForwardedRegister> &Forwards =
        FuncInfo->getForwardedMustTailRegParms();
    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

    // Conservatively forward AL on x86_64, since it might be used for varargs.
    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
    }

    // Copy all forwards from physical to virtual registers.
    for (ForwardedRegister &F : Forwards) {
      // FIXME: Can we use a less constrained schedule?
      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
    }
  }
  // Some CCs need callee pop.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
        !Subtarget.getTargetTriple().isOSMSVCRT() &&
        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }
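  // (For example, a 32-bit stdcall function taking 12 bytes of arguments
  // returns with "ret 12", while a cdecl function pops nothing; on non-MSVC
  // 32-bit targets an sret function additionally pops the 4-byte hidden
  // struct-return pointer, mirroring the CALLSEQ_END logic in LowerCall.)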
  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);
  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame). Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }
  if (CallConv == CallingConv::X86_RegCall ||
      F.hasFnAttribute("no_caller_saved_registers")) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
      MRI.disableCalleeSavedRegister(Pair.first);
  }

  return Chain;
}
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                       StackPtr, PtrOff);
  if (Flags.isByVal())
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);

  return DAG.getStore(
      Chain, dl, Arg, PtrOff,
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
/// Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy(DAG.getDataLayout());
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
  return SDValue(OutRetAddr.getNode(), 1);
}
/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                                        SDValue Chain, SDValue RetAddrFrIdx,
                                        EVT PtrVT, unsigned SlotSize,
                                        int FPDiff, const SDLoc &dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int NewReturnAddrFI =
      MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
                                          false);
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(), NewReturnAddrFI));
  return Chain;
}
/// Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  Mask.push_back(NumElems);
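  // (Illustrative: for VT == MVT::v2i64 the mask built here is <2, 1>, i.e.
  // element 0 comes from V2 and element 1 from V1, which is the same merge a
  // MOVSD performs on its two operands.)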
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  CallingConv::ID CallConv = CLI.CallConv;
  bool &isTailCall = CLI.IsTailCall;
  bool isVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
  bool IsSibcall = false;
  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
  auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
  const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
  const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
  bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
                 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));

  if (CallConv == CallingConv::X86_INTR)
    report_fatal_error("X86 interrupts may not be called directly");

  if (Attr.getValueAsString() == "true")
    isTailCall = false;
  if (Subtarget.isPICStyleGOT() &&
      !MF.getTarget().Options.GuaranteedTailCallOpt) {
    // If we are using a GOT, disable tail calls to external symbols with
    // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
    // GuaranteedTailCallOpt will override this.
    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
               G->getGlobal()->hasDefaultVisibility()))
      isTailCall = false;
  }

  bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
  if (IsMustTail) {
    // Force this to be a tail call. The verifier rules are enough to ensure
    // that we can lower this successfully without moving the return address
    // around.
    isTailCall = true;
  } else if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, SR != NotStructReturn,
                    MF.getFunction().hasStructRetAttr(), CLI.RetTy,
                    Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }
  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeArguments(Outs, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // arguments.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in the caller's
    // own caller's stack.
    NumBytes = 0;
  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
           canGuaranteeTCO(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
  int FPDiff = 0;
  if (isTailCall && !IsSibcall && !IsMustTail) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if delta is greater than previous delta.
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }
  unsigned NumBytesToPush = NumBytes;
  unsigned NumBytesToPop = NumBytes;

  // If we have an inalloca argument, all stack space has already been allocated
  // for us and is right at the top of the stack. We don't support multiple
  // arguments passed in memory when using inalloca.
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
    if (ArgLocs.back().getLocMemOffset() != 0)
      report_fatal_error("any parameter with the inalloca attribute must be "
                         "the only memory argument");
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
                                 NumBytes - NumBytesToPush, dl);
  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // The next loop assumes that the locations are in the same order as the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");
  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization arguments are handled later.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++OutIndex) {
    assert(OutIndex < Outs.size() && "Invalid Out index");
    // Skip inalloca arguments, they have already been written.
    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
    if (Flags.isInAlloca())
      continue;

    CCValAssign &VA = ArgLocs[I];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[OutIndex];
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (Arg.getValueType().isVector() &&
          Arg.getValueType().getVectorElementType() == MVT::i1)
        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
      else if (RegVT.is128BitVector()) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getBitcast(MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getBitcast(RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(
          Chain, dl, Arg, SpillSlot,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
      Arg = SpillSlot;
      break;
    }
    }
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      // Split v64i1 value into two registers.
      Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
                         Subtarget);
    } else if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && IsWin64) {
        // Win64 ABI requires argument XMM reg to be copied to the corresponding
        // shadow reg if callee is a varargs function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (!StackPtr.getNode())
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                      getPointerTy(DAG.getDataLayout()));
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }
  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(
          unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                                          getPointerTy(DAG.getDataLayout()))));
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ECX is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to the
      // target.
      //
      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasLocalLinkage() &&
          G->getGlobal()->hasDefaultVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }
  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const MCPhysReg XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
    assert((Subtarget.hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, dl,
                                                        MVT::i8)));
  }
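  // (For example, a call like printf("%f %f", x, y) passes x and y in XMM0
  // and XMM1, so AL is set to 2; the callee's prologue uses this bound to
  // decide how many XMM registers to spill into its register save area.)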
  if (isVarArg && IsMustTail) {
    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
    for (const auto &F : Forwards) {
      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
    }
  }
  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
  // don't need this because the eligibility check rejects calls that require
  // shuffling arguments passed in memory.
  if (!IsSibcall && isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI;
    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
         ++I, ++OutsIndex) {
      CCValAssign &VA = ArgLocs[I];

      if (VA.isRegLoc()) {
        if (VA.needsCustom()) {
          assert((CallConv == CallingConv::X86_RegCall) &&
                 "Expecting custom case only in regcall calling convention");
          // This means that we are in a special case where one argument was
          // passed through two register locations - skip the next location.
          ++I;
        }

        continue;
      }

      assert(VA.isMemLoc());
      SDValue Arg = OutVals[OutsIndex];
      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
      // Skip inalloca arguments. They don't require any work.
      if (Flags.isInAlloca())
        continue;
      // Create frame index.
      int32_t Offset = VA.getLocMemOffset() + FPDiff;
      uint32_t OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
      if (Flags.isByVal()) {
        // Copy relative to framepointer.
        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
        if (!StackPtr.getNode())
          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                        getPointerTy(DAG.getDataLayout()));
        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                             StackPtr, Source);

        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                         ArgChain,
                                                         Flags, DAG, dl));
      } else {
        // Store relative to framepointer.
        MemOpChains2.push_back(DAG.getStore(
            ArgChain, dl, Arg, FIN,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
                                     getPointerTy(DAG.getDataLayout()),
                                     RegInfo->getSlotSize(), FPDiff, dl);
  }
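  // (Worked example: if the caller was entered with 8 bytes of incoming
  // stack arguments but the tail callee needs 24, FPDiff is 8 - 24 = -16;
  // each outgoing slot above is created at LocMemOffset - 16 and the return
  // address is re-stored 16 bytes lower so the callee's frame lines up.)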
  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }
  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
    // it.
    GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);

    // We should use an extra load for direct calls to dllimported functions in
    // non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportStorageClass()) {
      unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);

      Callee = DAG.getTargetGlobalAddress(
          GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);

      if (OpFlags == X86II::MO_GOTPCREL) {
        // Add a wrapper.
        Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
                             getPointerTy(DAG.getDataLayout()), Callee);
        // Add extra indirection.
        Callee = DAG.getLoad(
            getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
            MachinePointerInfo::getGOT(DAG.getMachineFunction()));
      }
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
    unsigned char OpFlags =
        Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);

    Callee = DAG.getTargetExternalSymbol(
        S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
  } else if (Subtarget.isTarget64BitILP32() &&
             Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit Callee address into a 64-bit one according to the
    // x32 ABI.
    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
  }
  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
  // set the X86_INTR calling convention because it has the same CSR mask
  // (same preserved registers).
  const uint32_t *Mask = RegInfo->getCallPreservedMask(
      MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  // If this is an invoke in a 32-bit function using a funclet-based
  // personality, assume the function clobbers all registers. If an exception
  // is thrown, the runtime will not restore CSRs.
  // FIXME: Model this more precisely so that we can register allocate across
  // the normal edge and spill and fill across the exceptional edge.
  if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
    const Function &CallerFn = MF.getFunction();
    EHPersonality Pers =
        CallerFn.hasPersonalityFn()
            ? classifyEHPersonality(CallerFn.getPersonalityFn())
            : EHPersonality::Unknown;
    if (isFuncletEHPersonality(Pers))
      Mask = RegInfo->getNoPreservedMask();
  }
  // Define a new register mask from the existing mask.
  uint32_t *RegMask = nullptr;

  // In some calling conventions we need to remove the used physical registers
  // from the reg mask.
  if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

    // Allocate a new Reg Mask and copy Mask.
    RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
    unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
    memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);

    // Make sure all sub registers of the argument registers are reset
    // in the RegMask.
    for (auto const &RegPair : RegsToPass)
      for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));

    // Create the RegMask Operand according to our updated mask.
    Ops.push_back(DAG.getRegisterMask(RegMask));
  } else {
    // Create the RegMask Operand according to the static mask.
    Ops.push_back(DAG.getRegisterMask(Mask));
  }
  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls. Consider a void
    // function making a tail call to a function returning int.
    MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);
  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       DAG.getTarget().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPop = NumBytes;  // Callee pops everything.
  else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
           !Subtarget.getTargetTriple().isOSMSVCRT() &&
           SR == StackStructReturn)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
    NumBytesForCalleeToPop = 4;
  else
    NumBytesForCalleeToPop = 0;  // Callee pops nothing.

  if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
    // No need to reset the stack after the call if the call doesn't return. To
    // make the MI verify, we'll pretend the callee does it for us.
    NumBytesForCalleeToPop = NumBytes;
  }
  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, RegMask);
}
//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like stdcall, the callee cleans up the arguments, except that ECX is
//  reserved for storing the address of the tail-called function. Only 2
//  registers are free for argument passing (inreg). Tail call optimization is
//  performed provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld, for example.)
//  If the tail-called callee has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after
//  the original RETADDR, but before the saved framepointer or the spilled
//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
/// Round up StackSize so that, together with the return-address slot, the
/// stack stays aligned -- e.g. to 16n + 12 for a 16-byte alignment
/// requirement with 4-byte slots.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  unsigned SlotSize = RegInfo->getSlotSize();
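  // Worked example (x86-64 values, illustrative): StackAlignment = 16 and
  // SlotSize = 8 give AlignMask = 15. For StackSize = 20, 20 & 15 = 4 <= 8,
  // so Offset becomes 20 + (8 - 4) = 24 = 16*1 + 8. For StackSize = 30,
  // 30 & 15 = 14 > 8, so Offset = (30 & ~15) + 16 + 8 = 40 = 16*2 + 8. In
  // both cases the leftover slot is exactly the size of the return address.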
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
             (StackAlignment - SlotSize);
  }

  return Offset;
}
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
      Arg = Arg.getOperand(0);
      continue;
    }
    if (Op == ISD::TRUNCATE) {
      const SDValue &TruncInput = Arg.getOperand(0);
      if (TruncInput.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
              Arg.getValueType()) {
        Arg = TruncInput.getOperand(0);
        continue;
      }
    }
    break;
  }

  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  if (Offset != MFI.getObjectOffset(FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;

  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

  return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF.getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }
  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  unsigned StackArgsSize = 0;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    // Allocate shadow area for Win64.
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    StackArgsSize = CCInfo.getNextStackOffset();
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII, VA))
            return false;
        }
      }
    }
    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;

      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }
  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);

  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }

  return true;
}

FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}

//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//

static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}

static bool MayFoldIntoStore(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}

static bool MayFoldIntoZeroExtend(SDValue Op) {
  if (Op.hasOneUse()) {
    unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
    return (ISD::ZERO_EXTEND == Opcode);
  }
  return false;
}
static bool isTargetShuffle(unsigned Opcode) {
  switch(Opcode) {
  default: return false;
  case X86ISD::BLENDI:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::SHUFP:
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::MOVLHPS:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLPS:
  case X86ISD::MOVLPD:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::UNPCKL:
  case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VZEXT_MOVL:
    return true;
  }
}
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
  // Target Shuffles.
  case X86ISD::PSHUFB:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
    return true;
  // 'Faux' Target Shuffles.
  case ISD::OR:
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
  }
}
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    unsigned SlotSize = RegInfo->getSlotSize();
    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                          -(int64_t)SlotSize,
                                                          false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into a 32-bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;
  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For the small code model we assume that the latest object is 16MB before
  // the end of the 31-bit boundary. We may also accept pretty large negative
  // constants knowing that all objects are in the positive half of the
  // address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;
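  // (For example, a symbolic displacement of 8*1024*1024 (8MiB) is accepted
  // under the small code model, while 64MiB is rejected: the final address
  // could then wrap past the 2^31 boundary.)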
  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We may not accept negative offsets,
  // since they may be just off, but we may accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset >= 0)
    return true;

  return false;
}
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return !is64Bit;
  }
}
/// \brief Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
  switch (X86CC) {
  default:
    llvm_unreachable("Invalid integer condition!");
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return true;
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
    return false;
  }
}
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
  switch (SetCCOpcode) {
  default: llvm_unreachable("Invalid integer condition!");
  case ISD::SETEQ:  return X86::COND_E;
  case ISD::SETGT:  return X86::COND_G;
  case ISD::SETGE:  return X86::COND_GE;
  case ISD::SETLT:  return X86::COND_L;
  case ISD::SETLE:  return X86::COND_LE;
  case ISD::SETNE:  return X86::COND_NE;
  case ISD::SETULT: return X86::COND_B;
  case ISD::SETUGT: return X86::COND_A;
  case ISD::SETULE: return X86::COND_BE;
  case ISD::SETUGE: return X86::COND_AE;
  }
}
4419 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4420 /// condition code, returning the condition code and the LHS/RHS of the
4421 /// comparison to make.
4422 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4423 bool isFP, SDValue &LHS, SDValue &RHS,
4424 SelectionDAG &DAG) {
4426 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4427 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4428 // X > -1 -> X == 0, jump !sign.
4429 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4430 return X86::COND_NS;
4432 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4433 // X < 0 -> X == 0, jump on sign.
4436 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4437 // X < 1 -> X <= 0
4438 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4439 return X86::COND_LE;
4443 return TranslateIntegerX86CC(SetCCOpcode);
4446 // First determine if it is required or is profitable to flip the operands.
4448 // If LHS is a foldable load, but RHS is not, flip the condition.
4449 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4450 !ISD::isNON_EXTLoad(RHS.getNode())) {
4451 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4452 std::swap(LHS, RHS);
4455 switch (SetCCOpcode) {
4461 std::swap(LHS, RHS);
4465 // On a floating point condition, the flags are set as follows:
4466 // ZF | PF | CF | op
4467 // 0 | 0 | 0 | X > Y
4468 // 0 | 0 | 1 | X < Y
4469 // 1 | 0 | 0 | X == Y
4470 // 1 | 1 | 1 | unordered
4471 switch (SetCCOpcode) {
4472 default: llvm_unreachable("Condcode should be pre-legalized away");
4474 case ISD::SETEQ: return X86::COND_E;
4475 case ISD::SETOLT: // flipped
4477 case ISD::SETGT: return X86::COND_A;
4478 case ISD::SETOLE: // flipped
4480 case ISD::SETGE: return X86::COND_AE;
4481 case ISD::SETUGT: // flipped
4483 case ISD::SETLT: return X86::COND_B;
4484 case ISD::SETUGE: // flipped
4486 case ISD::SETLE: return X86::COND_BE;
4488 case ISD::SETNE: return X86::COND_NE;
4489 case ISD::SETUO: return X86::COND_P;
4490 case ISD::SETO: return X86::COND_NP;
4492 case ISD::SETUNE: return X86::COND_INVALID;
4496 /// Is there a floating point cmov for the specific X86 condition code?
4497 /// Current x86 isa includes the following FP cmov instructions:
4498 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4499 static bool hasFPCMov(unsigned X86CC) {
4516 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4518 MachineFunction &MF,
4519 unsigned Intrinsic) const {
4521 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4525 Info.opc = ISD::INTRINSIC_W_CHAIN;
4526 Info.flags = MachineMemOperand::MONone;
4529 switch (IntrData->Type) {
4530 case EXPAND_FROM_MEM: {
4531 Info.ptrVal = I.getArgOperand(0);
4532 Info.memVT = MVT::getVT(I.getType());
4534 Info.flags |= MachineMemOperand::MOLoad;
4537 case COMPRESS_TO_MEM: {
4538 Info.ptrVal = I.getArgOperand(0);
4539 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4541 Info.flags |= MachineMemOperand::MOStore;
4544 case TRUNCATE_TO_MEM_VI8:
4545 case TRUNCATE_TO_MEM_VI16:
4546 case TRUNCATE_TO_MEM_VI32: {
4547 Info.ptrVal = I.getArgOperand(0);
4548 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4549 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4550 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4552 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4553 ScalarVT = MVT::i16;
4554 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4555 ScalarVT = MVT::i32;
4557 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4559 Info.flags |= MachineMemOperand::MOStore;
4569 /// Returns true if the target can instruction select the
4570 /// specified FP immediate natively. If false, the legalizer will
4571 /// materialize the FP immediate as a load from a constant pool.
4572 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4573 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4574 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4580 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4581 ISD::LoadExtType ExtTy,
4583 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4584 // relocation target a movq or addq instruction: don't let the load shrink.
4585 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4586 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4587 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4588 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4592 /// \brief Returns true if it is beneficial to convert a load of a constant
4593 /// to just the constant itself.
4594 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4596 assert(Ty->isIntegerTy());
4598 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4599 if (BitSize == 0 || BitSize > 64)
4604 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4605 // TODO: It might be a win to ease or lift this restriction, but the generic
4606 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4607 if (VT.isVector() && Subtarget.hasAVX512())
4613 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4614 unsigned Index) const {
4615 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4618 // Mask vectors support all subregister combinations and operations that
4619 // extract half of vector.
4620 if (ResVT.getVectorElementType() == MVT::i1)
4621 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4622 (Index == ResVT.getVectorNumElements()));
4624 return (Index % ResVT.getVectorNumElements()) == 0;
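// e.g. (illustrative, assuming the extract itself is legal) extracting a
// v4i32 subvector from a v8i32 vector is considered cheap only at element
// indices 0 and 4.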
4627 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4628 // Speculate cttz only if we can directly use TZCNT.
4629 return Subtarget.hasBMI();
4632 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4633 // Speculate ctlz only if we can directly use LZCNT.
4634 return Subtarget.hasLZCNT();
4637 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4638 const SelectionDAG &DAG) const {
4639 // Do not merge stores to a width that would require FP or vector registers
4640 // when the NoImplicitFloat attribute is set.
4641 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4642 Attribute::NoImplicitFloat);
4645 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4646 return (MemVT.getSizeInBits() <= MaxIntSize);
4651 bool X86TargetLowering::isCtlzFast() const {
4652 return Subtarget.hasFastLZCNT();
4655 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4656 const Instruction &AndI) const {
4660 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4661 if (!Subtarget.hasBMI())
4664 // There are only 32-bit and 64-bit forms for 'andn'.
4665 EVT VT = Y.getValueType();
4666 if (VT != MVT::i32 && VT != MVT::i64)
4672 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4673 MVT VT = MVT::getIntegerVT(NumBits);
4674 if (isTypeLegal(VT))
4677 // PMOVMSKB can handle this.
4678 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4681 // VPMOVMSKB can handle this.
4682 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4685 // TODO: Allow 64-bit type for 32-bit target.
4686 // TODO: 512-bit types should be allowed, but make sure that those
4687 // cases are handled in combineVectorSizedSetCCEquality().
4689 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4692 /// Val is the undef sentinel value or equal to the specified value.
4693 static bool isUndefOrEqual(int Val, int CmpVal) {
4694 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
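// Note for these mask helpers: SM_SentinelUndef is -1 and SM_SentinelZero is
// -2 (see X86ShuffleDecode.h).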
4697 /// Val is either the undef or zero sentinel value.
4698 static bool isUndefOrZero(int Val) {
4699 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4702 /// Return true if every element in Mask, beginning
4703 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4704 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4705 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4706 if (Mask[i] != SM_SentinelUndef)
4711 /// Return true if Val is undef or if its value falls within the
4712 /// specified range (L, H].
4713 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4714 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4717 /// Return true if every element in Mask is undef or if its value
4718 /// falls within the specified range [Low, Hi).
4719 static bool isUndefOrInRange(ArrayRef<int> Mask,
4722 if (!isUndefOrInRange(M, Low, Hi))
4727 /// Return true if Val is undef, zero or if its value falls within the
4728 /// specified range [Low, Hi).
4729 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4730 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4733 /// Return true if every element in Mask is undef, zero or if its value
4734 /// falls within the specified range [Low, Hi).
4735 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4737 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4742 /// Return true if every element in Mask, beginning
4743 /// from position Pos and ending in Pos+Size, falls within the specified
4744 /// sequential range [Low, Low+Size), or is undef.
4745 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4746 unsigned Pos, unsigned Size, int Low) {
4747 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4748 if (!isUndefOrEqual(Mask[i], Low))
4753 /// Return true if every element in Mask, beginning
4754 /// from position Pos and ending in Pos+Size, falls within the specified
4755 /// sequential range [Low, Low+Size), or is undef or is zero.
4756 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4757 unsigned Size, int Low) {
4758 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4759 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4764 /// Return true if every element in Mask, beginning
4765 /// from position Pos and ending in Pos+Size is undef or is zero.
4766 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4768 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4769 if (!isUndefOrZero(Mask[i]))
4774 /// \brief Helper function to test whether a shuffle mask could be
4775 /// simplified by widening the elements being shuffled.
4777 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4778 /// leaves it in an unspecified state.
4780 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4781 /// shuffle masks. The latter have the special property of a '-2' representing
4782 // a zeroed lane of a vector.
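// A few illustrative cases (not from the original source): <0,1,2,3> widens
// to <0,1>; <-1,3,4,5> widens to <1,2> (the undef element pairs with an odd
// index); <0,2,4,5> fails because elements 0 and 2 are not an adjacent pair.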
4783 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4784 SmallVectorImpl<int> &WidenedMask) {
4785 WidenedMask.assign(Mask.size() / 2, 0);
4786 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4788 int M1 = Mask[i + 1];
4790 // If both elements are undef, it's trivial.
4791 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4792 WidenedMask[i / 2] = SM_SentinelUndef;
4796 // Check for an undef mask and a mask value properly aligned to fit with
4797 // a pair of values. If we find such a case, use the non-undef mask's value.
4798 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4799 WidenedMask[i / 2] = M1 / 2;
4802 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4803 WidenedMask[i / 2] = M0 / 2;
4807 // When zeroing, we need to spread the zeroing across both lanes to widen.
4808 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4809 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4810 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4811 WidenedMask[i / 2] = SM_SentinelZero;
4817 // Finally check if the two mask values are adjacent and aligned with
4818 // a pair.
4819 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4820 WidenedMask[i / 2] = M0 / 2;
4824 // Otherwise we can't safely widen the elements used in this shuffle.
4827 assert(WidenedMask.size() == Mask.size() / 2 &&
4828 "Incorrect size of mask after widening the elements!");
4833 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4834 bool X86::isZeroNode(SDValue Elt) {
4835 return isNullConstant(Elt) || isNullFPConstant(Elt);
4838 // Build a vector of constants.
4839 // Use an UNDEF node if MaskElt == -1.
4840 // Split 64-bit constants in 32-bit mode.
4841 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4842 const SDLoc &dl, bool IsMask = false) {
4844 SmallVector<SDValue, 32> Ops;
4847 MVT ConstVecVT = VT;
4848 unsigned NumElts = VT.getVectorNumElements();
4849 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4850 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4851 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4855 MVT EltVT = ConstVecVT.getVectorElementType();
4856 for (unsigned i = 0; i < NumElts; ++i) {
4857 bool IsUndef = Values[i] < 0 && IsMask;
4858 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4859 DAG.getConstant(Values[i], dl, EltVT);
4860 Ops.push_back(OpNode);
4862 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4863 DAG.getConstant(0, dl, EltVT));
4865 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4867 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4871 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4872 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4873 assert(Bits.size() == Undefs.getBitWidth() &&
4874 "Unequal constant and undef arrays");
4875 SmallVector<SDValue, 32> Ops;
4878 MVT ConstVecVT = VT;
4879 unsigned NumElts = VT.getVectorNumElements();
4880 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4881 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4882 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4886 MVT EltVT = ConstVecVT.getVectorElementType();
4887 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4889 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4892 const APInt &V = Bits[i];
4893 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4895 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4896 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4897 } else if (EltVT == MVT::f32) {
4898 APFloat FV(APFloat::IEEEsingle(), V);
4899 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4900 } else if (EltVT == MVT::f64) {
4901 APFloat FV(APFloat::IEEEdouble(), V);
4902 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4904 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4908 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4909 return DAG.getBitcast(VT, ConstsNode);
4912 /// Returns a vector of specified type with all zero elements.
4913 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4914 SelectionDAG &DAG, const SDLoc &dl) {
4915 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4916 VT.getVectorElementType() == MVT::i1) &&
4917 "Unexpected vector type");
4919 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4920 // type. This ensures they get CSE'd. But if the integer type is not
4921 // available, use a floating-point +0.0 instead.
4923 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4924 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4925 } else if (VT.getVectorElementType() == MVT::i1) {
4926 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4927 "Unexpected vector type");
4928 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4929 "Unexpected vector type");
4930 Vec = DAG.getConstant(0, dl, VT);
4932 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4933 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4935 return DAG.getBitcast(VT, Vec);
4938 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4939 const SDLoc &dl, unsigned vectorWidth) {
4940 EVT VT = Vec.getValueType();
4941 EVT ElVT = VT.getVectorElementType();
4942 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4943 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4944 VT.getVectorNumElements()/Factor);
4946 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4947 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4948 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4950 // This is the index of the first element of the vectorWidth-bit chunk
4951 // we want. Since ElemsPerChunk is a power of 2, just clear the low bits.
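// e.g. (illustrative) for 128-bit chunks of a v8i32 source, ElemsPerChunk is
// 4, so an IdxVal of 5 is rounded down to 4.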
4952 IdxVal &= ~(ElemsPerChunk - 1);
4954 // If the input is a buildvector just emit a smaller one.
4955 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4956 return DAG.getBuildVector(ResultVT, dl,
4957 Vec->ops().slice(IdxVal, ElemsPerChunk));
4959 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4960 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4963 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4964 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4965 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4966 /// instructions or a simple subregister reference. Idx is an index in the
4967 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4968 /// lowering EXTRACT_VECTOR_ELT operations easier.
4969 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4970 SelectionDAG &DAG, const SDLoc &dl) {
4971 assert((Vec.getValueType().is256BitVector() ||
4972 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4973 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4976 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4977 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4978 SelectionDAG &DAG, const SDLoc &dl) {
4979 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4980 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4983 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4984 SelectionDAG &DAG, const SDLoc &dl,
4985 unsigned vectorWidth) {
4986 assert((vectorWidth == 128 || vectorWidth == 256) &&
4987 "Unsupported vector width");
4988 // Inserting UNDEF just returns Result.
4991 EVT VT = Vec.getValueType();
4992 EVT ElVT = VT.getVectorElementType();
4993 EVT ResultVT = Result.getValueType();
4995 // Insert the relevant vectorWidth bits.
4996 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4997 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4999 // This is the index of the first element of the vectorWidth-bit chunk
5000 // we want. Since ElemsPerChunk is a power of 2, just clear the low bits.
5001 IdxVal &= ~(ElemsPerChunk - 1);
5003 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5004 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5007 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5008 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5009 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5010 /// simple superregister reference. Idx is an index in the 128 bits
5011 /// we want. It need not be aligned to a 128-bit boundary. That makes
5012 /// lowering INSERT_VECTOR_ELT operations easier.
5013 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5014 SelectionDAG &DAG, const SDLoc &dl) {
5015 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5016 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5019 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5020 SelectionDAG &DAG, const SDLoc &dl) {
5021 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5022 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5025 // Return true if the instruction zeroes the unused upper part of the
5026 // destination and accepts a mask.
5027 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5032 case X86ISD::TESTNM:
5033 case X86ISD::PCMPEQM:
5034 case X86ISD::PCMPGTM:
5037 case X86ISD::CMPM_RND:
5042 /// Insert an i1 subvector into an i1 vector.
5043 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5044 const X86Subtarget &Subtarget) {
5047 SDValue Vec = Op.getOperand(0);
5048 SDValue SubVec = Op.getOperand(1);
5049 SDValue Idx = Op.getOperand(2);
5051 if (!isa<ConstantSDNode>(Idx))
5054 // Inserting undef is a nop. We can just return the original vector.
5055 if (SubVec.isUndef())
5058 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5059 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5062 MVT OpVT = Op.getSimpleValueType();
5063 unsigned NumElems = OpVT.getVectorNumElements();
5065 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5067 // Extend to natively supported kshift.
5068 MVT WideOpVT = OpVT;
5069 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5070 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5072 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5073 // if necessary.
5074 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5075 // May need to promote to a legal type.
5076 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5077 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5079 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5082 MVT SubVecVT = SubVec.getSimpleValueType();
5083 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5085 assert(IdxVal + SubVecNumElems <= NumElems &&
5086 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5087 "Unexpected index value in INSERT_SUBVECTOR");
5089 SDValue Undef = DAG.getUNDEF(WideOpVT);
5092 // Zero lower bits of the Vec
5093 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5094 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5096 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5097 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5098 // Merge them together, SubVec should be zero extended.
5099 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5100 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5102 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5103 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5106 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5107 Undef, SubVec, ZeroIdx);
5109 if (Vec.isUndef()) {
5110 assert(IdxVal != 0 && "Unexpected index");
5111 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5112 DAG.getConstant(IdxVal, dl, MVT::i8));
5113 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5116 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5117 assert(IdxVal != 0 && "Unexpected index");
5118 NumElems = WideOpVT.getVectorNumElements();
5119 unsigned ShiftLeft = NumElems - SubVecNumElems;
5120 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5121 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5122 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5123 if (ShiftRight != 0)
5124 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5125 DAG.getConstant(ShiftRight, dl, MVT::i8));
5126 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5129 // Simple case: we put the subvector in the upper part.
5130 if (IdxVal + SubVecNumElems == NumElems) {
5131 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5132 DAG.getConstant(IdxVal, dl, MVT::i8));
5133 if (SubVecNumElems * 2 == NumElems) {
5134 // Special case, use legal zero extending insert_subvector. This allows
5135 // isel to optimize when bits are known zero.
5136 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5137 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5138 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5141 // Otherwise use explicit shifts to zero the bits.
5142 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5143 Undef, Vec, ZeroIdx);
5144 NumElems = WideOpVT.getVectorNumElements();
5145 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5146 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5147 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5149 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5150 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5153 // Inserting into the middle is more complicated.
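// A sketch of the sequence below (illustrative): inserting a v2i1 at index 2
// of a v8i1 computes ((Vec >> 2) ^ SubVec), shifts left by 6 to clear the
// low bits, shifts right by 4 to land at bit 2, and xors with Vec so that
// only bits [3:2] change.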
5155 NumElems = WideOpVT.getVectorNumElements();
5157 // Widen the vector if needed.
5158 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5159 // Move the current value of the bits to be replaced to the lsbs.
5160 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5161 DAG.getConstant(IdxVal, dl, MVT::i8));
5162 // Xor with the new bit.
5163 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5164 // Shift to MSB, filling bottom bits with 0.
5165 unsigned ShiftLeft = NumElems - SubVecNumElems;
5166 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5167 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5168 // Shift to the final position, filling upper bits with 0.
5169 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5170 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5171 DAG.getConstant(ShiftRight, dl, MVT::i8));
5172 // Xor with original vector leaving the new value.
5173 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5174 // Reduce to original width if needed.
5175 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5178 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5179 /// instructions. This is used because creating CONCAT_VECTORS nodes of
5180 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5181 /// large BUILD_VECTORS.
5182 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5183 unsigned NumElems, SelectionDAG &DAG,
5185 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5186 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5189 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5190 unsigned NumElems, SelectionDAG &DAG,
5192 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5193 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5196 /// Returns a vector of specified type with all bits set.
5197 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5198 /// Then bitcast to their original type, ensuring they get CSE'd.
5199 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5200 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5201 "Expected a 128/256/512-bit vector type");
5203 APInt Ones = APInt::getAllOnesValue(32);
5204 unsigned NumElts = VT.getSizeInBits() / 32;
5205 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5206 return DAG.getBitcast(VT, Vec);
5209 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5210 SelectionDAG &DAG) {
5211 EVT InVT = In.getValueType();
5212 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5214 if (VT.is128BitVector() && InVT.is128BitVector())
5215 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5216 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5218 // For 256-bit vectors, we only need the lower (128-bit) input half.
5219 // For 512-bit vectors, we only need the lower input half or quarter.
5220 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5221 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5222 In = extractSubVector(In, 0, DAG, DL,
5223 std::max(128, (int)VT.getSizeInBits() / Scale));
5226 return DAG.getNode(Opc, DL, VT, In);
5229 /// Returns a vector_shuffle node for an unpackl operation.
5230 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5231 SDValue V1, SDValue V2) {
5232 SmallVector<int, 8> Mask;
5233 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5234 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5237 /// Returns a vector_shuffle node for an unpackh operation.
5238 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5239 SDValue V1, SDValue V2) {
5240 SmallVector<int, 8> Mask;
5241 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5242 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5245 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5246 /// This produces a shuffle where the low element of V2 is swizzled into the
5247 /// zero/undef vector, landing at element Idx.
5248 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5249 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5251 const X86Subtarget &Subtarget,
5252 SelectionDAG &DAG) {
5253 MVT VT = V2.getSimpleValueType();
5255 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5256 int NumElems = VT.getVectorNumElements();
5257 SmallVector<int, 16> MaskVec(NumElems);
5258 for (int i = 0; i != NumElems; ++i)
5259 // If this is the insertion idx, put the low elt of V2 here.
5260 MaskVec[i] = (i == Idx) ? NumElems : i;
5261 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5264 static SDValue peekThroughBitcasts(SDValue V) {
5265 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5266 V = V.getOperand(0);
5270 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5271 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5272 V.getOperand(0).hasOneUse())
5273 V = V.getOperand(0);
5277 static const Constant *getTargetConstantFromNode(SDValue Op) {
5278 Op = peekThroughBitcasts(Op);
5280 auto *Load = dyn_cast<LoadSDNode>(Op);
5284 SDValue Ptr = Load->getBasePtr();
5285 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5286 Ptr->getOpcode() == X86ISD::WrapperRIP)
5287 Ptr = Ptr->getOperand(0);
5289 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5290 if (!CNode || CNode->isMachineConstantPoolEntry())
5293 return dyn_cast<Constant>(CNode->getConstVal());
5296 // Extract raw constant bits from constant pools.
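// For example (illustrative): requesting 64-bit elements from a v4i32 build
// vector <1, 0, 2, 0> yields EltBits = { 0x1, 0x2 }, since adjacent narrow
// constants are repacked little-endian into each wider element.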
5297 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5299 SmallVectorImpl<APInt> &EltBits,
5300 bool AllowWholeUndefs = true,
5301 bool AllowPartialUndefs = true) {
5302 assert(EltBits.empty() && "Expected an empty EltBits vector");
5304 Op = peekThroughBitcasts(Op);
5306 EVT VT = Op.getValueType();
5307 unsigned SizeInBits = VT.getSizeInBits();
5308 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5309 unsigned NumElts = SizeInBits / EltSizeInBits;
5311 // Bitcast a source array of element bits to the target size.
5312 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5313 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5314 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5315 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5316 "Constant bit sizes don't match");
5318 // Don't split if we don't allow undef bits.
5319 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5320 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5323 // If we're already the right size, don't bother bitcasting.
5324 if (NumSrcElts == NumElts) {
5325 UndefElts = UndefSrcElts;
5326 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5330 // Extract all the undef/constant element data and pack into single bitsets.
5331 APInt UndefBits(SizeInBits, 0);
5332 APInt MaskBits(SizeInBits, 0);
5334 for (unsigned i = 0; i != NumSrcElts; ++i) {
5335 unsigned BitOffset = i * SrcEltSizeInBits;
5336 if (UndefSrcElts[i])
5337 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5338 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5341 // Split the undef/constant single bitset data into the target elements.
5342 UndefElts = APInt(NumElts, 0);
5343 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5345 for (unsigned i = 0; i != NumElts; ++i) {
5346 unsigned BitOffset = i * EltSizeInBits;
5347 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5349 // Only treat an element as UNDEF if all bits are UNDEF.
5350 if (UndefEltBits.isAllOnesValue()) {
5351 if (!AllowWholeUndefs)
5353 UndefElts.setBit(i);
5357 // If only some bits are UNDEF then treat them as zero (or bail if not
5359 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5362 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5363 EltBits[i] = Bits.getZExtValue();
5368 // Collect constant bits and insert into mask/undef bit masks.
5369 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5370 unsigned UndefBitIndex) {
5373 if (isa<UndefValue>(Cst)) {
5374 Undefs.setBit(UndefBitIndex);
5377 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5378 Mask = CInt->getValue();
5381 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5382 Mask = CFP->getValueAPF().bitcastToAPInt();
5390 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5391 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5392 return CastBitData(UndefSrcElts, SrcEltBits);
5395 // Extract scalar constant bits.
5396 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5397 APInt UndefSrcElts = APInt::getNullValue(1);
5398 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5399 return CastBitData(UndefSrcElts, SrcEltBits);
5402 // Extract constant bits from build vector.
5403 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5404 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5405 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5407 APInt UndefSrcElts(NumSrcElts, 0);
5408 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5409 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5410 const SDValue &Src = Op.getOperand(i);
5411 if (Src.isUndef()) {
5412 UndefSrcElts.setBit(i);
5415 auto *Cst = cast<ConstantSDNode>(Src);
5416 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5418 return CastBitData(UndefSrcElts, SrcEltBits);
5421 // Extract constant bits from constant pool vector.
5422 if (auto *Cst = getTargetConstantFromNode(Op)) {
5423 Type *CstTy = Cst->getType();
5424 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5427 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5428 unsigned NumSrcElts = CstTy->getVectorNumElements();
5430 APInt UndefSrcElts(NumSrcElts, 0);
5431 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5432 for (unsigned i = 0; i != NumSrcElts; ++i)
5433 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5437 return CastBitData(UndefSrcElts, SrcEltBits);
5440 // Extract constant bits from a broadcasted constant pool scalar.
5441 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5442 EltSizeInBits <= VT.getScalarSizeInBits()) {
5443 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5444 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5445 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5447 APInt UndefSrcElts(NumSrcElts, 0);
5448 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5449 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5450 if (UndefSrcElts[0])
5451 UndefSrcElts.setBits(0, NumSrcElts);
5452 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5453 return CastBitData(UndefSrcElts, SrcEltBits);
5458 // Extract a rematerialized scalar constant insertion.
5459 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5460 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5461 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5462 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5463 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5465 APInt UndefSrcElts(NumSrcElts, 0);
5466 SmallVector<APInt, 64> SrcEltBits;
5467 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5468 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5469 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5470 return CastBitData(UndefSrcElts, SrcEltBits);
5476 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5477 unsigned MaskEltSizeInBits,
5478 SmallVectorImpl<uint64_t> &RawMask) {
5480 SmallVector<APInt, 64> EltBits;
5482 // Extract the raw target constant bits.
5483 // FIXME: We currently don't support UNDEF bits or mask entries.
5484 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5485 EltBits, /* AllowWholeUndefs */ false,
5486 /* AllowPartialUndefs */ false))
5489 // Insert the extracted elements into the mask.
5490 for (APInt Elt : EltBits)
5491 RawMask.push_back(Elt.getZExtValue());
5496 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5497 /// Note: This ignores saturation, so inputs must be checked first.
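/// For example (illustrative): a binary pack producing v16i8 from two v8i16
/// inputs corresponds to the byte shuffle
/// <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30>, i.e. the low byte of every
/// i16 element of both inputs.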
5498 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5500 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5501 unsigned NumElts = VT.getVectorNumElements();
5502 unsigned NumLanes = VT.getSizeInBits() / 128;
5503 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5504 unsigned Offset = Unary ? 0 : NumElts;
5506 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5507 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5508 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5509 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5510 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5514 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5515 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5516 /// operands in \p Ops, and returns true.
5517 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5518 /// IsUnary for shuffles which use a single input multiple times, and in those
5519 /// cases it will adjust the mask to only have indices within that single input.
5520 /// It is an error to call this with non-empty Mask/Ops vectors.
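/// For example (illustrative): a v4f32 X86ISD::SHUFP with immediate 0x1B
/// decodes to Mask = <3,2,5,4>, where indices 4-7 refer to the second operand.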
5521 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5522 SmallVectorImpl<SDValue> &Ops,
5523 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5524 unsigned NumElems = VT.getVectorNumElements();
5527 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5528 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5531 bool IsFakeUnary = false;
5532 switch(N->getOpcode()) {
5533 case X86ISD::BLENDI:
5534 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5535 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5536 ImmN = N->getOperand(N->getNumOperands()-1);
5537 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5538 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5541 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5542 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5543 ImmN = N->getOperand(N->getNumOperands()-1);
5544 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5545 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5547 case X86ISD::INSERTPS:
5548 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5549 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5550 ImmN = N->getOperand(N->getNumOperands()-1);
5551 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5552 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5554 case X86ISD::EXTRQI:
5555 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5556 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5557 isa<ConstantSDNode>(N->getOperand(2))) {
5558 int BitLen = N->getConstantOperandVal(1);
5559 int BitIdx = N->getConstantOperandVal(2);
5560 DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5564 case X86ISD::INSERTQI:
5565 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5566 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5567 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5568 isa<ConstantSDNode>(N->getOperand(3))) {
5569 int BitLen = N->getConstantOperandVal(2);
5570 int BitIdx = N->getConstantOperandVal(3);
5571 DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5572 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5575 case X86ISD::UNPCKH:
5576 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5577 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5578 DecodeUNPCKHMask(VT, Mask);
5579 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5581 case X86ISD::UNPCKL:
5582 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5583 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5584 DecodeUNPCKLMask(VT, Mask);
5585 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5587 case X86ISD::MOVHLPS:
5588 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5589 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5590 DecodeMOVHLPSMask(NumElems, Mask);
5591 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5593 case X86ISD::MOVLHPS:
5594 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5595 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5596 DecodeMOVLHPSMask(NumElems, Mask);
5597 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5599 case X86ISD::PALIGNR:
5600 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5601 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5602 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5603 ImmN = N->getOperand(N->getNumOperands()-1);
5604 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5605 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5606 Ops.push_back(N->getOperand(1));
5607 Ops.push_back(N->getOperand(0));
5609 case X86ISD::VSHLDQ:
5610 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5611 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5612 ImmN = N->getOperand(N->getNumOperands() - 1);
5613 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5616 case X86ISD::VSRLDQ:
5617 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5618 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5619 ImmN = N->getOperand(N->getNumOperands() - 1);
5620 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5623 case X86ISD::PSHUFD:
5624 case X86ISD::VPERMILPI:
5625 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5626 ImmN = N->getOperand(N->getNumOperands()-1);
5627 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5630 case X86ISD::PSHUFHW:
5631 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5632 ImmN = N->getOperand(N->getNumOperands()-1);
5633 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5636 case X86ISD::PSHUFLW:
5637 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5638 ImmN = N->getOperand(N->getNumOperands()-1);
5639 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5642 case X86ISD::VZEXT_MOVL:
5643 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5644 DecodeZeroMoveLowMask(VT, Mask);
5647 case X86ISD::VBROADCAST: {
5648 SDValue N0 = N->getOperand(0);
5649 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5650 // add the pre-extracted value to the Ops vector.
5651 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5652 N0.getOperand(0).getValueType() == VT &&
5653 N0.getConstantOperandVal(1) == 0)
5654 Ops.push_back(N0.getOperand(0));
5656 // We only decode broadcasts of same-sized vectors, unless the broadcast
5657 // came from an extract from the original width. If we found one, we
5658 // pushed it onto the Ops vector above.
5659 if (N0.getValueType() == VT || !Ops.empty()) {
5660 DecodeVectorBroadcast(VT, Mask);
5666 case X86ISD::VPERMILPV: {
5667 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5669 SDValue MaskNode = N->getOperand(1);
5670 unsigned MaskEltSize = VT.getScalarSizeInBits();
5671 SmallVector<uint64_t, 32> RawMask;
5672 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5673 DecodeVPERMILPMask(VT, RawMask, Mask);
5676 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5677 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5682 case X86ISD::PSHUFB: {
5683 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5684 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5685 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5687 SDValue MaskNode = N->getOperand(1);
5688 SmallVector<uint64_t, 32> RawMask;
5689 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5690 DecodePSHUFBMask(RawMask, Mask);
5693 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5694 DecodePSHUFBMask(C, Mask);
5699 case X86ISD::VPERMI:
5700 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5701 ImmN = N->getOperand(N->getNumOperands()-1);
5702 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5707 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5708 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5709 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5711 case X86ISD::VPERM2X128:
5712 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5713 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5714 ImmN = N->getOperand(N->getNumOperands()-1);
5715 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5716 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5718 case X86ISD::MOVSLDUP:
5719 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5720 DecodeMOVSLDUPMask(VT, Mask);
5723 case X86ISD::MOVSHDUP:
5724 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5725 DecodeMOVSHDUPMask(VT, Mask);
5728 case X86ISD::MOVDDUP:
5729 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5730 DecodeMOVDDUPMask(VT, Mask);
5733 case X86ISD::MOVLPD:
5734 case X86ISD::MOVLPS:
5735 // Not yet implemented
5737 case X86ISD::VPERMIL2: {
5738 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5739 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5740 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5741 unsigned MaskEltSize = VT.getScalarSizeInBits();
5742 SDValue MaskNode = N->getOperand(2);
5743 SDValue CtrlNode = N->getOperand(3);
5744 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5745 unsigned CtrlImm = CtrlOp->getZExtValue();
5746 SmallVector<uint64_t, 32> RawMask;
5747 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5748 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5751 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5752 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5758 case X86ISD::VPPERM: {
5759 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5760 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5761 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5762 SDValue MaskNode = N->getOperand(2);
5763 SmallVector<uint64_t, 32> RawMask;
5764 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5765 DecodeVPPERMMask(RawMask, Mask);
5768 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5769 DecodeVPPERMMask(C, Mask);
5774 case X86ISD::VPERMV: {
5775 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5777 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5778 Ops.push_back(N->getOperand(1));
5779 SDValue MaskNode = N->getOperand(0);
5780 SmallVector<uint64_t, 32> RawMask;
5781 unsigned MaskEltSize = VT.getScalarSizeInBits();
5782 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5783 DecodeVPERMVMask(RawMask, Mask);
5786 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5787 DecodeVPERMVMask(C, MaskEltSize, Mask);
5792 case X86ISD::VPERMV3: {
5793 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5794 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5795 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5796 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5797 Ops.push_back(N->getOperand(0));
5798 Ops.push_back(N->getOperand(2));
5799 SDValue MaskNode = N->getOperand(1);
5800 unsigned MaskEltSize = VT.getScalarSizeInBits();
5801 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5802 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5807 case X86ISD::VPERMIV3: {
5808 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5809 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5810 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5811 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5812 Ops.push_back(N->getOperand(1));
5813 Ops.push_back(N->getOperand(2));
5814 SDValue MaskNode = N->getOperand(0);
5815 unsigned MaskEltSize = VT.getScalarSizeInBits();
5816 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5817 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5822 default: llvm_unreachable("unknown target shuffle node");
5825 // Empty mask indicates the decode failed.
5829 // Check if we're getting a shuffle mask with zero'd elements.
5830 if (!AllowSentinelZero)
5831 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5834 // If we have a fake unary shuffle, the shuffle mask is spread across two
5835 // inputs that are actually the same node. Re-map the mask to always point
5836 // into the first input.
5839 if (M >= (int)Mask.size())
5842 // If we didn't already add operands in the opcode-specific code, default to
5843 // adding 1 or 2 operands starting at 0.
5845 Ops.push_back(N->getOperand(0));
5846 if (!IsUnary || IsFakeUnary)
5847 Ops.push_back(N->getOperand(1));
5853 /// Check a target shuffle mask's inputs to see if we can set any values to
5854 /// SM_SentinelZero - this is for elements that are known to be zero
5855 /// (not just zeroable) from their inputs.
5856 /// Returns true if the target shuffle mask was decoded.
5857 static bool setTargetShuffleZeroElements(SDValue N,
5858 SmallVectorImpl<int> &Mask,
5859 SmallVectorImpl<SDValue> &Ops) {
5861 if (!isTargetShuffle(N.getOpcode()))
5864 MVT VT = N.getSimpleValueType();
5865 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5868 SDValue V1 = Ops[0];
5869 SDValue V2 = IsUnary ? V1 : Ops[1];
5871 V1 = peekThroughBitcasts(V1);
5872 V2 = peekThroughBitcasts(V2);
5874 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5875 "Illegal split of shuffle value type");
5876 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5878 // Extract known constant input data.
5879 APInt UndefSrcElts[2];
5880 SmallVector<APInt, 32> SrcEltBits[2];
5881 bool IsSrcConstant[2] = {
5882 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5883 SrcEltBits[0], true, false),
5884 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5885 SrcEltBits[1], true, false)};
5887 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5890 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5894 // Determine shuffle input and normalize the mask.
5895 unsigned SrcIdx = M / Size;
5896 SDValue V = M < Size ? V1 : V2;
5899 // We are referencing an UNDEF input.
5901 Mask[i] = SM_SentinelUndef;
5905 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5906 // TODO: We currently only set UNDEF for integer types - floats use the same
5907 // registers as vectors and many of the scalar folded loads rely on the
5908 // SCALAR_TO_VECTOR pattern.
5909 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5910 (Size % V.getValueType().getVectorNumElements()) == 0) {
5911 int Scale = Size / V.getValueType().getVectorNumElements();
5912 int Idx = M / Scale;
5913 if (Idx != 0 && !VT.isFloatingPoint())
5914 Mask[i] = SM_SentinelUndef;
5915 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5916 Mask[i] = SM_SentinelZero;
5920 // Attempt to extract from the source's constant bits.
5921 if (IsSrcConstant[SrcIdx]) {
5922 if (UndefSrcElts[SrcIdx][M])
5923 Mask[i] = SM_SentinelUndef;
5924 else if (SrcEltBits[SrcIdx][M] == 0)
5925 Mask[i] = SM_SentinelZero;
5929 assert(VT.getVectorNumElements() == Mask.size() &&
5930 "Different mask size from vector size!");
5934 // Attempt to decode ops that could be represented as a shuffle mask.
5935 // The decoded shuffle mask may contain a different number of elements than the
5936 // destination value type.
5937 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5938 SmallVectorImpl<SDValue> &Ops,
5939 SelectionDAG &DAG) {
5943 MVT VT = N.getSimpleValueType();
5944 unsigned NumElts = VT.getVectorNumElements();
5945 unsigned NumSizeInBits = VT.getSizeInBits();
5946 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5947 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5948 "Expected byte aligned value types");
5950 unsigned Opcode = N.getOpcode();
5953 case X86ISD::ANDNP: {
5954 // Attempt to decode as a per-byte mask.
5956 SmallVector<APInt, 32> EltBits;
5957 SDValue N0 = N.getOperand(0);
5958 SDValue N1 = N.getOperand(1);
5959 bool IsAndN = (X86ISD::ANDNP == Opcode);
5960 uint64_t ZeroMask = IsAndN ? 255 : 0;
5961 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5963 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5965 Mask.push_back(SM_SentinelUndef);
5968 uint64_t ByteBits = EltBits[i].getZExtValue();
5969 if (ByteBits != 0 && ByteBits != 255)
5971 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5973 Ops.push_back(IsAndN ? N1 : N0);
5976 case ISD::SCALAR_TO_VECTOR: {
5977 // Match against a scalar_to_vector of an extract from a vector,
5978 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
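// e.g. (illustrative) scalar_to_vector(extract_vector_elt(v4i32 X, 1))
// decodes to Ops = { X } and Mask = <1, U, U, U>.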
5979 SDValue N0 = N.getOperand(0);
5982 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5983 N0.getOperand(0).getValueType() == VT) ||
5984 (N0.getOpcode() == X86ISD::PEXTRW &&
5985 N0.getOperand(0).getValueType() == MVT::v8i16) ||
5986 (N0.getOpcode() == X86ISD::PEXTRB &&
5987 N0.getOperand(0).getValueType() == MVT::v16i8)) {
5991 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5994 SDValue SrcVec = SrcExtract.getOperand(0);
5995 EVT SrcVT = SrcVec.getValueType();
5996 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5997 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
5999 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6000 if (NumSrcElts <= SrcIdx)
6003 Ops.push_back(SrcVec);
6004 Mask.push_back(SrcIdx);
6005 Mask.append(NumZeros, SM_SentinelZero);
6006 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6009 case X86ISD::PINSRB:
6010 case X86ISD::PINSRW: {
6011 SDValue InVec = N.getOperand(0);
6012 SDValue InScl = N.getOperand(1);
6013 uint64_t InIdx = N.getConstantOperandVal(2);
6014 assert(InIdx < NumElts && "Illegal insertion index");
6016 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6017 if (X86::isZeroNode(InScl)) {
6018 Ops.push_back(InVec);
6019 for (unsigned i = 0; i != NumElts; ++i)
6020 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6024 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6025 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
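// e.g. (illustrative) PINSRW(X, PEXTRW(Y, 2), 5) on v8i16 decodes to
// Ops = { X, Y } and Mask = <0,1,2,3,4,10,6,7>.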
6027 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6028 if (InScl.getOpcode() != ExOp)
6031 SDValue ExVec = InScl.getOperand(0);
6032 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6033 assert(ExIdx < NumElts && "Illegal extraction index");
6034 Ops.push_back(InVec);
6035 Ops.push_back(ExVec);
6036 for (unsigned i = 0; i != NumElts; ++i)
6037 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6040 case X86ISD::PACKSS:
6041 case X86ISD::PACKUS: {
6042 SDValue N0 = N.getOperand(0);
6043 SDValue N1 = N.getOperand(1);
6044 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6045 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6046 "Unexpected input value type");
6048 // If we know input saturation won't happen we can treat this
6049 // as a truncation shuffle.
6050 if (Opcode == X86ISD::PACKSS) {
6051 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6052           (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
        return false;
    } else {
6055       APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6056 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6057           (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
        return false;
    }
6061     bool IsUnary = (N0 == N1);

    Ops.push_back(N0);
    if (!IsUnary)
      Ops.push_back(N1);

6067     createPackShuffleMask(VT, Mask, IsUnary);
    return true;
  }
  case X86ISD::VSHLI:
6071   case X86ISD::VSRLI: {
6072 uint64_t ShiftVal = N.getConstantOperandVal(1);
6073 // Out of range bit shifts are guaranteed to be zero.
6074 if (NumBitsPerElt <= ShiftVal) {
6075       Mask.append(NumElts, SM_SentinelZero);
      return true;
    }
6079 // We can only decode 'whole byte' bit shifts as shuffles.
6080     if ((ShiftVal % 8) != 0)
      return false;
6083 uint64_t ByteShift = ShiftVal / 8;
6084 unsigned NumBytes = NumSizeInBits / 8;
6085 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6086 Ops.push_back(N.getOperand(0));
6088 // Clear mask to all zeros and insert the shifted byte indices.
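    // e.g. (VSRLI (v2i64 X), 16) decodes to the byte mask
    // {2,3,4,5,6,7,Z,Z, 10,11,12,13,14,15,Z,Z} with Z = SM_SentinelZero.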
6089 Mask.append(NumBytes, SM_SentinelZero);
6091 if (X86ISD::VSHLI == Opcode) {
6092 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6093 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6094           Mask[i + j] = i + j - ByteShift;
    } else {
6096       for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6097 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6098           Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
6102 case ISD::ZERO_EXTEND_VECTOR_INREG:
6103 case X86ISD::VZEXT: {
6104 // TODO - add support for VPMOVZX with smaller input vector types.
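    // e.g. (v4i32 zero_extend_vector_inreg (v8i16 V)) decodes to the v8i16
    // mask {0, Z, 1, Z, 2, Z, 3, Z} with Z = SM_SentinelZero.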
6105 SDValue Src = N.getOperand(0);
6106 MVT SrcVT = Src.getSimpleValueType();
6107     if (NumSizeInBits != SrcVT.getSizeInBits())
      break;
6109     DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}
6118 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
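/// e.g. with Inputs = {A, B}, a mask width of 4 and no mask index in [0, 4),
/// A is removed and indices 4-7 are rebased to 0-3.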
6119 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6120 SmallVectorImpl<int> &Mask) {
6121 int MaskWidth = Mask.size();
6122 SmallVector<SDValue, 16> UsedInputs;
6123 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6124 int lo = UsedInputs.size() * MaskWidth;
6125 int hi = lo + MaskWidth;
6127 // Strip UNDEF input usage.
6128     if (Inputs[i].isUndef())
      for (int &M : Mask)
6130         if ((lo <= M) && (M < hi))
6131           M = SM_SentinelUndef;
6133 // Check for unused inputs.
6134 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6135       UsedInputs.push_back(Inputs[i]);
      continue;
    }

    for (int &M : Mask)
      if (lo <= M)
        M -= MaskWidth;
  }
6142   Inputs = UsedInputs;
}
6145 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6146 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6147 /// remaining input indices in case we now have a unary shuffle and adjust the
6148 /// inputs accordingly.
6149 /// Returns true if the target shuffle mask was decoded.
6150 static bool resolveTargetShuffleInputs(SDValue Op,
6151 SmallVectorImpl<SDValue> &Inputs,
6152 SmallVectorImpl<int> &Mask,
6153 SelectionDAG &DAG) {
6154   if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6155     if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
      return false;
6158   resolveTargetShuffleInputsAndMask(Inputs, Mask);
  return true;
}
6162 /// Returns the scalar element that will make up the ith
6163 /// element of the result of the vector shuffle.
6164 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
6167     return SDValue(); // Limit search depth.
6169 SDValue V = SDValue(N, 0);
6170 EVT VT = V.getValueType();
6171 unsigned Opcode = V.getOpcode();
6173 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6174 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6175     int Elt = SV->getMaskElt(Index);
    if (Elt < 0)
6178       return DAG.getUNDEF(VT.getVectorElementType());
6180 unsigned NumElems = VT.getVectorNumElements();
6181 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6182 : SV->getOperand(1);
6183 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6186 // Recurse into target specific vector shuffles to find scalars.
6187 if (isTargetShuffle(Opcode)) {
6188 MVT ShufVT = V.getSimpleValueType();
6189 MVT ShufSVT = ShufVT.getVectorElementType();
6190 int NumElems = (int)ShufVT.getVectorNumElements();
6191 SmallVector<int, 16> ShuffleMask;
6192 SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;
6195     if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
      return SDValue();
6198 int Elt = ShuffleMask[Index];
6199 if (Elt == SM_SentinelZero)
6200 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6201 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6202 if (Elt == SM_SentinelUndef)
6203 return DAG.getUNDEF(ShufSVT);
6205 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6206 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6207 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6211 // Actual nodes that may contain scalar elements
6212 if (Opcode == ISD::BITCAST) {
6213 V = V.getOperand(0);
6214 EVT SrcVT = V.getValueType();
6215 unsigned NumElems = VT.getVectorNumElements();
6217     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }
6221 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6222 return (Index == 0) ? V.getOperand(0)
6223 : DAG.getUNDEF(VT.getVectorElementType());
6225 if (V.getOpcode() == ISD::BUILD_VECTOR)
6226     return V.getOperand(Index);

  return SDValue();
}
6231 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6232 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6233 unsigned NumNonZero, unsigned NumZero,
6235 const X86Subtarget &Subtarget) {
6236 MVT VT = Op.getSimpleValueType();
6237 unsigned NumElts = VT.getVectorNumElements();
6238 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6239 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6240 "Illegal vector insertion");
6246 for (unsigned i = 0; i < NumElts; ++i) {
6247 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6251     // If the build vector contains zeros or our first insertion is not the
6252     // first index, then insert into a zero vector to break any register
6253     // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6256 if (NumZero || 0 != i)
6257 V = getZeroVector(VT, Subtarget, DAG, dl);
6259 assert(0 == i && "Expected insertion into zero-index");
6260 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6261 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6262 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6263 V = DAG.getBitcast(VT, V);
6267 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6268 DAG.getIntPtrConstant(i, dl));
6274 /// Custom lower build_vector of v16i8.
6275 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6276 unsigned NumNonZero, unsigned NumZero,
6278 const X86Subtarget &Subtarget) {
6279 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6282 // SSE4.1 - use PINSRB to insert each byte directly.
6283 if (Subtarget.hasSSE41())
6284 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6291 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
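  // Each pair of bytes <lo, hi> at indices 2*k and 2*k+1 is combined into the
  // 16-bit value (hi << 8) | lo and inserted at v8i16 element k.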
  SDValue V;
  bool First = true;
6292   for (unsigned i = 0; i < 16; ++i) {
6293 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6294     if (ThisIsNonZero && First) {
      if (NumZero)
6296         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else
6298         V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }
6303 // FIXME: Investigate extending to i32 instead of just i16.
6304 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6305 SDValue ThisElt, LastElt;
6306 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6307 if (LastIsNonZero) {
      LastElt =
6309           DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
    }
6311 if (ThisIsNonZero) {
6312 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6313 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6314 DAG.getConstant(8, dl, MVT::i8));
      if (LastIsNonZero)
6316         ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
    } else
      ThisElt = LastElt;
6322 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6323 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6324 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6325 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6326 V = DAG.getBitcast(MVT::v8i16, V);
6328 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6329 DAG.getIntPtrConstant(i / 2, dl));
6335 return DAG.getBitcast(MVT::v16i8, V);
6338 /// Custom lower build_vector of v8i16.
6339 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6340 unsigned NumNonZero, unsigned NumZero,
6342 const X86Subtarget &Subtarget) {
6343 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6346   // Use PINSRW to insert each word element directly.
6347 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6351 /// Custom lower build_vector of v4i32 or v4f32.
6352 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6353 const X86Subtarget &Subtarget) {
6354 // Find all zeroable elements.
6355 std::bitset<4> Zeroable;
6356   for (int i = 0; i < 4; ++i) {
6357 SDValue Elt = Op->getOperand(i);
6358 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6360 assert(Zeroable.size() - Zeroable.count() > 1 &&
6361 "We expect at least two non-zero elements!");
6363 // We only know how to deal with build_vector nodes where elements are either
6364 // zeroable or extract_vector_elt with constant index.
6365 SDValue FirstNonZero;
6366 unsigned FirstNonZeroIdx;
6367   for (unsigned i = 0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
6370 SDValue Elt = Op->getOperand(i);
6371 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6372 !isa<ConstantSDNode>(Elt.getOperand(1)))
6374 // Make sure that this node is extracting from a 128-bit vector.
6375 MVT VT = Elt.getOperand(0).getSimpleValueType();
6376 if (!VT.is128BitVector())
6378     if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
6380       FirstNonZeroIdx = i;
    }
  }
6384 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6385 SDValue V1 = FirstNonZero.getOperand(0);
6386 MVT VT = V1.getSimpleValueType();
6388 // See if this build_vector can be lowered as a blend with zero.
  SDValue Elt;
6390   unsigned EltMaskIdx, EltIdx;
  int Mask[4];
6392   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6393 if (Zeroable[EltIdx]) {
6394 // The zero vector will be on the right hand side.
6395       Mask[EltIdx] = EltIdx + 4;
      continue;
    }
6399 Elt = Op->getOperand(EltIdx);
6400 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6401 EltMaskIdx = Elt.getConstantOperandVal(1);
6402     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
6404     Mask[EltIdx] = EltIdx;
  }
  if (EltIdx == 4) {
6408     // Let the shuffle legalizer deal with blend operations.
6409 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6410 if (V1.getSimpleValueType() != VT)
6411 V1 = DAG.getBitcast(VT, V1);
6412     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
  }
6415   // See if we can lower this build_vector to an INSERTPS.
6416   if (!Subtarget.hasSSE41())
    return SDValue();
6419 SDValue V2 = Elt.getOperand(0);
6420   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();
6423 bool CanFold = true;
6424 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

6428     SDValue Current = Op->getOperand(i);
6429     SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
6432     CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
  }

  if (!CanFold)
    return SDValue();
6438 assert(V1.getNode() && "Expected at least two non-zero elements!");
6439 if (V1.getSimpleValueType() != MVT::v4f32)
6440 V1 = DAG.getBitcast(MVT::v4f32, V1);
6441 if (V2.getSimpleValueType() != MVT::v4f32)
6442 V2 = DAG.getBitcast(MVT::v4f32, V2);
6444 // Ok, we can emit an INSERTPS instruction.
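  // The INSERTPS immediate encodes, from the MSB down: bits [7:6] the source
  // element (CountS), bits [5:4] the destination slot (CountD), and bits
  // [3:0] a mask of destination elements to zero out (ZMask).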
6445 unsigned ZMask = Zeroable.to_ulong();
6447 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6448 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDLoc DL(Op);
6450   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6451 DAG.getIntPtrConstant(InsertPSMask, DL));
6452 return DAG.getBitcast(VT, Result);
6455 /// Return a vector logical shift node.
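/// e.g. getVShift(true, v2i64, X, 16) emits (bitcast (VSHLDQ (bitcast X to
/// v16i8), 2)), i.e. a whole-vector byte shift left by two bytes.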
6456 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6457                          SelectionDAG &DAG, const TargetLowering &TLI,
                         const SDLoc &dl) {
6459 assert(VT.is128BitVector() && "Unknown type for VShift");
6460 MVT ShVT = MVT::v16i8;
6461 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6462 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6463 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6464 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6465 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6466 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6469 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6470 SelectionDAG &DAG) {
6472   // Check if the scalar load can be widened into a vector load, and if
6473   // the address is "base + cst", see if the cst can be "absorbed" into
6474   // the shuffle mask.
6475 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6476 SDValue Ptr = LD->getBasePtr();
6477     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
6479 EVT PVT = LD->getValueType(0);
6480     if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
6485 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6486       FI = FINode->getIndex();
      Offset = 0;
6488 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6489 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6490 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6491 Offset = Ptr.getConstantOperandVal(1);
6492       Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }
6497 // FIXME: 256-bit vector instructions don't require a strict alignment,
6498 // improve this code to support it better.
6499 unsigned RequiredAlign = VT.getSizeInBits()/8;
6500 SDValue Chain = LD->getChain();
6501 // Make sure the stack object alignment is at least 16 or 32.
6502 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6503 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6504 if (MFI.isFixedObjectIndex(FI)) {
6505 // Can't change the alignment. FIXME: It's possible to compute
6506 // the exact stack offset and reference FI + adjust offset instead.
6507         // If someone *really* cares about this, that's the way to implement it.
        return SDValue();
      }
6510       MFI.setObjectAlignment(FI, RequiredAlign);
    }
6514     // (Offset % 16 or 32) must be a multiple of 4. The address is then
6515     // Ptr + (Offset & ~15).
6518     if ((Offset % RequiredAlign) & 3)
      return SDValue();
6520     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);

    SDLoc DL(Ptr);
6523 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6524 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6527 int EltNo = (Offset - StartOffset) >> 2;
6528 unsigned NumElems = VT.getVectorNumElements();
6530 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6531 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6532 LD->getPointerInfo().getWithOffset(StartOffset));
6534 SmallVector<int, 8> Mask(NumElems, EltNo);
6536     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
6542 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6543 /// elements can be replaced by a single large load which has the same value as
6544 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6546 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6547 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6548 const SDLoc &DL, SelectionDAG &DAG,
6549 const X86Subtarget &Subtarget,
6550 bool isAfterLegalize) {
6551 unsigned NumElems = Elts.size();
6553 int LastLoadedElt = -1;
6554 SmallBitVector LoadMask(NumElems, false);
6555 SmallBitVector ZeroMask(NumElems, false);
6556 SmallBitVector UndefMask(NumElems, false);
6558   // For each element in the initializer, see if we've found a load, zero or an
  // undef.
6560 for (unsigned i = 0; i < NumElems; ++i) {
6561     SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();

    if (Elt.isUndef())
6566       UndefMask[i] = true;
6567     else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
      ZeroMask[i] = true;
6569     else if (ISD::isNON_EXTLoad(Elt.getNode())) {
      LoadMask[i] = true;
      LastLoadedElt = i;
6572 // Each loaded element must be the correct fractional portion of the
6573 // requested vector load.
6574       if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
        return SDValue();
    }
  }
6579 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6580 "Incomplete element masks");
6582 // Handle Special Cases - all undef or undef/zero.
6583 if (UndefMask.count() == NumElems)
6584 return DAG.getUNDEF(VT);
6586 // FIXME: Should we return this as a BUILD_VECTOR instead?
6587 if ((ZeroMask | UndefMask).count() == NumElems)
6588 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6589 : DAG.getConstantFP(0.0, DL, VT);
6591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6592 int FirstLoadedElt = LoadMask.find_first();
6593 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6594 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6595 EVT LDBaseVT = EltBase.getValueType();
6597 // Consecutive loads can contain UNDEFS but not ZERO elements.
6598   // Consecutive loads with UNDEF and ZERO elements require an
6599   // additional shuffle stage to clear the ZERO elements.
6600 bool IsConsecutiveLoad = true;
6601 bool IsConsecutiveLoadWithZeros = true;
6602 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
6604       SDValue Elt = peekThroughBitcasts(Elts[i]);
6605 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6606 if (!DAG.areNonVolatileConsecutiveLoads(
6607 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6608 i - FirstLoadedElt)) {
6609 IsConsecutiveLoad = false;
6610 IsConsecutiveLoadWithZeros = false;
6613 } else if (ZeroMask[i]) {
6614 IsConsecutiveLoad = false;
6618 SmallVector<LoadSDNode *, 8> Loads;
6619 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
    if (LoadMask[i])
6621       Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6623 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6624 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6625 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6626 "Cannot merge volatile loads.");
    SDValue NewLd =
6628         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6629 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6630 for (auto *LD : Loads)
6631       DAG.makeEquivalentMemoryOrdering(LD, NewLd);
    return NewLd;
  };
6635 // LOAD - all consecutive load/undefs (must start/end with a load).
6636 // If we have found an entire vector of loads and undefs, then return a large
6637 // load of the entire vector width starting at the base pointer.
6638 // If the vector contains zeros, then attempt to shuffle those elements.
6639 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6640 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6641 assert(LDBase && "Did not find base load for merging consecutive loads");
6642 EVT EltVT = LDBase->getValueType(0);
6643 // Ensure that the input vector size for the merged loads matches the
6644 // cumulative size of the input elements.
6645     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
      return SDValue();
6648     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();
6651 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6652 // will lower to regular temporal loads and use the cache.
6653 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6654         VT.is256BitVector() && !Subtarget.hasInt256())
      return SDValue();
6657 if (IsConsecutiveLoad)
6658 return CreateLoad(VT, LDBase);
6660 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6661 // vector and a zero vector to clear out the zero elements.
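    // e.g. with loads at elements 0-1, a zero at 2 and an undef at 3, the
    // clear mask becomes {0, 1, 2 + NumElems, -1}, blending in the zero
    // vector's element.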
6662 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6663 SmallVector<int, 4> ClearMask(NumElems, -1);
6664 for (unsigned i = 0; i < NumElems; ++i) {
        if (ZeroMask[i])
6666           ClearMask[i] = i + NumElems;
6667         else if (LoadMask[i])
          ClearMask[i] = i;
      }
6670 SDValue V = CreateLoad(VT, LDBase);
6671 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6672 : DAG.getConstantFP(0.0, DL, VT);
6673 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
    }
  }

  unsigned LoadSize =
6678       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6680 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6681 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6682 (LoadSize == 32 || LoadSize == 64) &&
6683 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6684 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6685 : MVT::getIntegerVT(LoadSize);
6686 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6687 if (TLI.isTypeLegal(VecVT)) {
6688 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6689 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode =
6691           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6692 LDBase->getPointerInfo(),
6693 LDBase->getAlignment(),
6694 MachineMemOperand::MOLoad);
6695 for (auto *LD : Loads)
6696 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6697       return DAG.getBitcast(VT, ResNode);
    }
  }

  return SDValue();
}
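/// Split \p SplatValue into \p SplatBitSize / scalar-size pieces and build a
/// constant vector from them, lowest bits first; e.g. a 64-bit splat value
/// 0x0000000100000002 with a 32-bit scalar type yields <i32 2, i32 1>.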
6704 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6705 unsigned SplatBitSize, LLVMContext &C) {
6706 unsigned ScalarSize = VT.getScalarSizeInBits();
6707 unsigned NumElm = SplatBitSize / ScalarSize;
6709 SmallVector<Constant *, 32> ConstantVec;
6710 for (unsigned i = 0; i < NumElm; i++) {
6711     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
    Constant *Const;
6713 if (VT.isFloatingPoint()) {
6714 if (ScalarSize == 32) {
6715         Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
      } else {
6717         assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6718         Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
      }
    } else
6721       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6722 ConstantVec.push_back(Const);
6724 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6727 static bool isUseOfShuffle(SDNode *N) {
6728 for (auto *U : N->uses()) {
6729     if (isTargetShuffle(U->getOpcode()))
      return true;
6731     if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6732       return isUseOfShuffle(U);
  }
  return false;
}
6737 // Check if the current node of a build vector is a zero-extended vector.
6738 // If so, return the value extended.
6739 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6740 // NumElt - return the number of zero-extended identical values.
6741 // EltType - return the type of the value, including the zero extend.
6742 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6743 unsigned &NumElt, MVT &EltType) {
6744 SDValue ExtValue = Op->getOperand(0);
6745 unsigned NumElts = Op->getNumOperands();
6746 unsigned Delta = NumElts;
6748 for (unsigned i = 1; i < NumElts; i++) {
6749     if (Op->getOperand(i) == ExtValue) {
      Delta = i;
      break;
    }
6753     if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
      return SDValue();
  }
6756   if (!isPowerOf2_32(Delta) || Delta == 1)
    return SDValue();
6759 for (unsigned i = Delta; i < NumElts; i++) {
6760 if (i % Delta == 0) {
6761       if (Op->getOperand(i) != ExtValue)
        return SDValue();
6763     } else if (!(isNullConstant(Op->getOperand(i)) ||
6764                  Op->getOperand(i).isUndef()))
      return SDValue();
  }
6767 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6768 unsigned ExtVTSize = EltSize * Delta;
6769 EltType = MVT::getIntegerVT(ExtVTSize);
6770   NumElt = NumElts / Delta;
  return ExtValue;
}
6774 /// Attempt to use the vbroadcast instruction to generate a splat value
6775 /// from a splat BUILD_VECTOR which uses:
6776 /// a. A single scalar load, or a constant.
6777 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6779 /// The VBROADCAST node is returned when a pattern is found,
6780 /// or SDValue() otherwise.
6781 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6782 const X86Subtarget &Subtarget,
6783 SelectionDAG &DAG) {
6784 // VBROADCAST requires AVX.
6785 // TODO: Splats could be generated for non-AVX CPUs using SSE
6786 // instructions, but there's less potential gain for only 128-bit vectors.
6787 if (!Subtarget.hasAVX())
6790 MVT VT = BVOp->getSimpleValueType(0);
6793 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6794 "Unsupported vector type for broadcast.");
6796 BitVector UndefElements;
6797 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6799 // Attempt to use VBROADCASTM
6800   // From this pattern:
6801 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
6802 // b. t1 = (build_vector t0 t0)
6804 // Create (VBROADCASTM v2i1 X)
6805 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
6806 MVT EltType = VT.getScalarType();
6807 unsigned NumElts = VT.getVectorNumElements();
    SDValue BOperand;
6809     SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
6810 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
6811 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
6812 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
      if (ZeroExtended)
6814         BOperand = ZeroExtended.getOperand(0);
      else
6816         BOperand = Ld.getOperand(0).getOperand(0);
6817 if (BOperand.getValueType().isVector() &&
6818 BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
6819 if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
6820 NumElts == 8)) || // for broadcastmb2q
6821 (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
6822 NumElts == 16))) { // for broadcastmw2d
        SDValue Brdcst =
6824             DAG.getNode(X86ISD::VBROADCASTM, dl,
6825 MVT::getVectorVT(EltType, NumElts), BOperand);
6826 return DAG.getBitcast(VT, Brdcst);
6832 // We need a splat of a single value to use broadcast, and it doesn't
6833 // make any sense if the value is only in one element of the vector.
6834 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6835 APInt SplatValue, Undef;
6836     unsigned SplatBitSize;
    bool HasUndef;
6838 // Check if this is a repeated constant pattern suitable for broadcasting.
6839 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6840 SplatBitSize > VT.getScalarSizeInBits() &&
6841 SplatBitSize < VT.getSizeInBits()) {
6842       // Avoid replacing with a broadcast when the build_vector is used by a
6843       // shuffle instruction, to preserve the present custom lowering of shuffles.
6844       if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
        return SDValue();
6846       // Replace BUILD_VECTOR with a broadcast of the repeated constants.
6847 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6848 LLVMContext *Ctx = DAG.getContext();
6849 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6850 if (Subtarget.hasAVX()) {
6851 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6852 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6853 // Splatted value can fit in one INTEGER constant in constant pool.
6854 // Load the constant and broadcast it.
6855 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6856 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6857 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6858 SDValue CP = DAG.getConstantPool(C, PVT);
6859 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6861 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          SDValue Ld = DAG.getLoad(
6863               CVT, dl, DAG.getEntryNode(), CP,
6864               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
6866 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6867 MVT::getVectorVT(CVT, Repeat), Ld);
6868 return DAG.getBitcast(VT, Brdcst);
6869 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6870 // Splatted value can fit in one FLOAT constant in constant pool.
6871 // Load the constant and broadcast it.
6872           // AVX has support for 32- and 64-bit broadcasts of floats only;
6873           // there is no 64-bit integer broadcast on a 32-bit subtarget.
6874 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6875 // Lower the splat via APFloat directly, to avoid any conversion.
          Constant *C =
              SplatBitSize == 32
6878                   ? ConstantFP::get(*Ctx,
6879                                     APFloat(APFloat::IEEEsingle(), SplatValue))
6880                   : ConstantFP::get(*Ctx,
6881                                     APFloat(APFloat::IEEEdouble(), SplatValue));
6882 SDValue CP = DAG.getConstantPool(C, PVT);
6883 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6885 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          SDValue Ld = DAG.getLoad(
6887               CVT, dl, DAG.getEntryNode(), CP,
6888               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
6890 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6891 MVT::getVectorVT(CVT, Repeat), Ld);
6892 return DAG.getBitcast(VT, Brdcst);
6893 } else if (SplatBitSize > 64) {
6894 // Load the vector of constants and broadcast it.
6895 MVT CVT = VT.getScalarType();
6896 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6898 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6899 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6900 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
          SDValue Ld = DAG.getLoad(
6902               MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6903               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
6905 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6906 return DAG.getBitcast(VT, Brdcst);
6913 bool ConstSplatVal =
6914 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6916 // Make sure that all of the users of a non-constant load are from the
6917 // BUILD_VECTOR node.
6918 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6921 unsigned ScalarSize = Ld.getValueSizeInBits();
6922 bool IsGE256 = (VT.getSizeInBits() >= 256);
6924 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6925 // instruction to save 8 or more bytes of constant pool data.
6926 // TODO: If multiple splats are generated to load the same constant,
6927 // it may be detrimental to overall size. There needs to be a way to detect
6928 // that condition to know if this is truly a size win.
6929 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
6931 // Handle broadcasting a single constant scalar from the constant pool
6933 // On Sandybridge (no AVX2), it is still better to load a constant vector
6934 // from the constant pool and not to broadcast it from a scalar.
6935 // But override that restriction when optimizing for size.
6936 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6937 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6938 EVT CVT = Ld.getValueType();
6939 assert(!CVT.isVector() && "Must not broadcast a vector type");
6941 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6942 // For size optimization, also splat v2f64 and v2i64, and for size opt
6943 // with AVX2, also splat i8 and i16.
6944 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6945 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6946 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6947 const Constant *C = nullptr;
6948 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6949 C = CI->getConstantIntValue();
6950 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6951 C = CF->getConstantFPValue();
6953 assert(C && "Invalid constant type");
6955 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
6957           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6958 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(
6960           CVT, dl, DAG.getEntryNode(), CP,
6961           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
          Alignment);
6964 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6968 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6970 // Handle AVX2 in-register broadcasts.
6971 if (!IsLoad && Subtarget.hasInt256() &&
6972 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6973 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6975   // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();
6979 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6980 (Subtarget.hasVLX() && ScalarSize == 64))
6981 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6983   // The integer check is needed for the 64-bit into 128-bit case, so it
6984   // doesn't match f64, since there is no vbroadcastsd xmm.
6985 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6986 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6987 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6990   // Unsupported broadcast.
  return SDValue();
}
6994 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6995 /// underlying vector and index.
6997 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6999 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
7001 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7002   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;
7005   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
7007   //   (extract_vector_elt (v8f32 %1), Constant<6>)
  // to:
7009   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
7010   //                           (extract_subvector (v8f32 %0), Constant<4>),
  //                           undef)
  //                       Constant<0>)
7013   // In this case the vector is the extract_subvector expression and the index
7014   // is 2, as specified by the shuffle.
7015 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7016 SDValue ShuffleVec = SVOp->getOperand(0);
7017 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7018 assert(ShuffleVecVT.getVectorElementType() ==
7019 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7021 int ShuffleIdx = SVOp->getMaskElt(Idx);
7022 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7023     ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}
7029 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7030 MVT VT = Op.getSimpleValueType();
7032 // Skip if insert_vec_elt is not supported.
7033 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7034 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7038 unsigned NumElems = Op.getNumOperands();
7042 SmallVector<unsigned, 4> InsertIndices;
7043 SmallVector<int, 8> Mask(NumElems, -1);
7045 for (unsigned i = 0; i != NumElems; ++i) {
7046 unsigned Opc = Op.getOperand(i).getOpcode();
7048     if (Opc == ISD::UNDEF)
      continue;
7051 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7052       // Quit if more than 1 element needs inserting.
7053       if (InsertIndices.size() > 1)
        return SDValue();
7056       InsertIndices.push_back(i);
      continue;
    }
7060 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7061 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7063 // Quit if non-constant index.
7064     if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
7066 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7068 // Quit if extracted from vector of different type.
7069     if (ExtractedFromVec.getValueType() != VT)
      return SDValue();
7072 if (!VecIn1.getNode())
7073 VecIn1 = ExtractedFromVec;
7074 else if (VecIn1 != ExtractedFromVec) {
7075 if (!VecIn2.getNode())
7076 VecIn2 = ExtractedFromVec;
7077 else if (VecIn2 != ExtractedFromVec)
7078         // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

7082     if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
7084     else if (ExtractedFromVec == VecIn2)
7085 Mask[i] = Idx + NumElems;
  }

7088   if (!VecIn1.getNode())
    return SDValue();
7091 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7092 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7094 for (unsigned Idx : InsertIndices)
7095 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7096                      DAG.getIntPtrConstant(Idx, DL));

  return NV;
}
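/// Pack a constant vXi1 build_vector into an integer immediate, mapping
/// operand idx to bit idx; e.g. <i1 1,0,1,1,0,0,0,0> becomes i8 0x0D.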
7101 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7102 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7103 Op.getScalarValueSizeInBits() == 1 &&
7104 "Can not convert non-constant vector");
7105 uint64_t Immediate = 0;
7106 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7107 SDValue In = Op.getOperand(idx);
    if (!In.isUndef())
7109       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
  }
  SDLoc dl(Op);
7112 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7113 return DAG.getConstant(Immediate, dl, VT);
7115 // Lower BUILD_VECTOR operation for mask (vXi1) value types.
SDValue
7117 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
7119   MVT VT = Op.getSimpleValueType();
7120 assert((VT.getVectorElementType() == MVT::i1) &&
7121 "Unexpected type in LowerBUILD_VECTORvXi1!");
7124   if (ISD::isBuildVectorAllZeros(Op.getNode()))
    return Op;

7127   if (ISD::isBuildVectorAllOnes(Op.getNode()))
    return Op;
7130 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7131 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7132 // Split the pieces.
      SDValue Lower =
7134           DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
      SDValue Upper =
7136           DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7137 // We have to manually lower both halves so getNode doesn't try to
7138 // reassemble the build_vector.
7139 Lower = LowerBUILD_VECTORvXi1(Lower, DAG);
7140 Upper = LowerBUILD_VECTORvXi1(Upper, DAG);
7141 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7143 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7144 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7145 return DAG.getBitcast(VT, Imm);
7146 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7147 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7148 DAG.getIntPtrConstant(0, dl));
7151 // Vector has one or more non-const elements
7152 uint64_t Immediate = 0;
7153 SmallVector<unsigned, 16> NonConstIdx;
7154 bool IsSplat = true;
7155 bool HasConstElts = false;
  int SplatIdx = -1;
7157   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7158     SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
7161     if (!isa<ConstantSDNode>(In))
7162       NonConstIdx.push_back(idx);
    else {
7164       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7165       HasConstElts = true;
    }
    if (SplatIdx < 0)
      SplatIdx = idx;
7169     else if (In != Op.getOperand(SplatIdx))
7170       IsSplat = false;
  }
7173   // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
  if (IsSplat)
7175     return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7176 DAG.getConstant(1, dl, VT),
7177 DAG.getConstant(0, dl, VT));
7179   // Insert elements one by one.
  SDValue DstVec;
  SDValue Imm;
  if (Immediate) {
7183     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7184     Imm = DAG.getConstant(Immediate, dl, ImmVT);
  }
7186 else if (HasConstElts)
7187 Imm = DAG.getConstant(0, dl, VT);
  else
7189     Imm = DAG.getUNDEF(VT);
7190 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7191 DstVec = DAG.getBitcast(VT, Imm);
  else {
7193     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7194 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7195                          DAG.getIntPtrConstant(0, dl));
  }
7198 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7199 unsigned InsertIdx = NonConstIdx[i];
7200 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7201 Op.getOperand(InsertIdx),
7202                          DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}
7207 /// \brief Return true if \p N implements a horizontal binop and return the
7208 /// operands for the horizontal binop into V0 and V1.
7210 /// This is a helper function of LowerToHorizontalOp().
7211 /// This function checks that the build_vector \p N in input implements a
7212 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7213 /// operation to match.
7214 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7215 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7216 /// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
7219 /// This function only analyzes elements of \p N whose indices are
7220 /// in range [BaseIdx, LastIdx).
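/// e.g. a v4f32 build_vector of
///   (fadd A[0], A[1]), (fadd A[2], A[3]), (fadd B[0], B[1]), (fadd B[2], B[3])
/// matches a horizontal FADD with V0 = A and V1 = B.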
7221 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7223 unsigned BaseIdx, unsigned LastIdx,
7224 SDValue &V0, SDValue &V1) {
7225 EVT VT = N->getValueType(0);
7227 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7228 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7229 "Invalid Vector in input!");
7231 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7232 bool CanFold = true;
7233 unsigned ExpectedVExtractIdx = BaseIdx;
7234 unsigned NumElts = LastIdx - BaseIdx;
7235 V0 = DAG.getUNDEF(VT);
7236 V1 = DAG.getUNDEF(VT);
7238 // Check if N implements a horizontal binop.
7239 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7240 SDValue Op = N->getOperand(i + BaseIdx);
7243 if (Op->isUndef()) {
7244 // Update the expected vector extract index.
7245 if (i * 2 == NumElts)
7246 ExpectedVExtractIdx = BaseIdx;
7247       ExpectedVExtractIdx += 2;
      continue;
    }
7251     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
    if (!CanFold)
      break;
7256 SDValue Op0 = Op.getOperand(0);
7257 SDValue Op1 = Op.getOperand(1);
7259 // Try to match the following pattern:
7260 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7261 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7262 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7263 Op0.getOperand(0) == Op1.getOperand(0) &&
7264 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7265                isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;
7269 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7270 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7272     if (i * 2 < NumElts) {
      if (V0.isUndef()) {
7274         V0 = Op0.getOperand(0);
7275         if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.isUndef()) {
7280         V1 = Op0.getOperand(0);
7281         if (V1.getValueType() != VT)
          return false;
      }
7284       if (i * 2 == NumElts)
7285         ExpectedVExtractIdx = BaseIdx;
    }
7288 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7289 if (I0 == ExpectedVExtractIdx)
7290 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7291 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7292 // Try to match the following dag sequence:
7293 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7294       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

7298     ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
7304 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7305 /// a concat_vector.
7307 /// This is a helper function of LowerToHorizontalOp().
7308 /// This function expects two 256-bit vectors called V0 and V1.
7309 /// At first, each vector is split into two separate 128-bit vectors.
7310 /// Then, the resulting 128-bit vectors are used to implement two
7311 /// horizontal binary operations.
7313 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7315 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7316 /// the two new horizontal binop.
7317 /// When Mode is set, the first horizontal binop dag node would take as input
7318 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7319 /// horizontal binop dag node would take as input the lower 128-bit of V1
7320 /// and the upper 128-bit of V1.
7322 /// HADD V0_LO, V0_HI
7323 /// HADD V1_LO, V1_HI
7325 /// Otherwise, the first horizontal binop dag node takes as input the lower
7326 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7327 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7329 /// HADD V0_LO, V1_LO
7330 /// HADD V0_HI, V1_HI
7332 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7333 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7334 /// the upper 128-bits of the result.
7335 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7336 const SDLoc &DL, SelectionDAG &DAG,
7337 unsigned X86Opcode, bool Mode,
7338 bool isUndefLO, bool isUndefHI) {
7339 MVT VT = V0.getSimpleValueType();
7340 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7341 "Invalid nodes in input!");
7343 unsigned NumElts = VT.getVectorNumElements();
7344 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7345 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7346 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7347 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7348 MVT NewVT = V0_LO.getSimpleValueType();
7350 SDValue LO = DAG.getUNDEF(NewVT);
7351 SDValue HI = DAG.getUNDEF(NewVT);
  if (Mode) {
7354     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7355 if (!isUndefLO && !V0->isUndef())
7356 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7357 if (!isUndefHI && !V1->isUndef())
7358       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
7360     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7361 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7362 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7364 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7365       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }
7368 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7371 /// Returns true iff \p BV builds a vector with the result equivalent to
7372 /// the result of ADDSUB operation.
7373 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7374 /// are written to the parameters \p Opnd0 and \p Opnd1.
7375 static bool isAddSub(const BuildVectorSDNode *BV,
7376 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7377 SDValue &Opnd0, SDValue &Opnd1,
7378 unsigned &NumExtracts) {
7380 MVT VT = BV->getSimpleValueType(0);
7381 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7382 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7383       (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
    return false;
7386 unsigned NumElts = VT.getVectorNumElements();
7387 SDValue InVec0 = DAG.getUNDEF(VT);
7388   SDValue InVec1 = DAG.getUNDEF(VT);

  NumExtracts = 0;
7392 // Odd-numbered elements in the input build vector are obtained from
7393 // adding two integer/float elements.
7394 // Even-numbered elements in the input build vector are obtained from
7395 // subtracting two integer/float elements.
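  // e.g. an ADDSUB of v4f32 computes
  //   <A[0]-B[0], A[1]+B[1], A[2]-B[2], A[3]+B[3]>.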
7396 unsigned ExpectedOpcode = ISD::FSUB;
7397 unsigned NextExpectedOpcode = ISD::FADD;
7398 bool AddFound = false;
7399 bool SubFound = false;
7401 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7402 SDValue Op = BV->getOperand(i);
7404 // Skip 'undef' values.
7405 unsigned Opcode = Op.getOpcode();
7406 if (Opcode == ISD::UNDEF) {
7407 std::swap(ExpectedOpcode, NextExpectedOpcode);
7411 // Early exit if we found an unexpected opcode.
7412 if (Opcode != ExpectedOpcode)
7415 SDValue Op0 = Op.getOperand(0);
7416 SDValue Op1 = Op.getOperand(1);
7418 // Try to match the following pattern:
7419 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7420 // Early exit if we cannot match that sequence.
7421 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7422 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7423 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7424 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7425 Op0.getOperand(1) != Op1.getOperand(1))
7428     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return false;
7432     // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;
7438 // Update InVec0 and InVec1.
7439 if (InVec0.isUndef()) {
7440 InVec0 = Op0.getOperand(0);
7441       if (InVec0.getSimpleValueType() != VT)
        return false;
    }
7444 if (InVec1.isUndef()) {
7445 InVec1 = Op1.getOperand(0);
7446       if (InVec1.getSimpleValueType() != VT)
        return false;
    }
7450 // Make sure that operands in input to each add/sub node always
7451 // come from a same pair of vectors.
7452 if (InVec0 != Op0.getOperand(0)) {
7453       if (ExpectedOpcode == ISD::FSUB)
        return false;
7456 // FADD is commutable. Try to commute the operands
7457 // and then test again.
7458 std::swap(Op0, Op1);
7459       if (InVec0 != Op0.getOperand(0))
        return false;
    }

7463     if (InVec1 != Op1.getOperand(0))
      return false;
7466 // Update the pair of expected opcodes.
7467 std::swap(ExpectedOpcode, NextExpectedOpcode);
7469     // Increment the number of extractions done.
    ++NumExtracts;
  }
7473 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7474   if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
    return false;

  Opnd0 = InVec0;
  Opnd1 = InVec1;
  return true;
}
7482 /// Returns true if it is possible to fold MUL and an idiom that has already
7483 /// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7484 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7485 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7487 /// Prior to calling this function it should be known that there is some
7488 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7489 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7490 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7491 /// of \p Opnd0 uses is expected to be equal to 2.
7492 /// For example, this function may be called for the following IR:
7493 /// %AB = fmul fast <2 x double> %A, %B
7494 /// %Sub = fsub fast <2 x double> %AB, %C
7495 /// %Add = fadd fast <2 x double> %AB, %C
7496 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7497 /// <2 x i32> <i32 0, i32 3>
7498 /// There is a def for %Addsub here, which potentially can be replaced by
7499 /// X86ISD::ADDSUB operation:
7500 /// %Addsub = X86ISD::ADDSUB %AB, %C
7501 /// and such ADDSUB can further be replaced with FMADDSUB:
7502 /// %Addsub = FMADDSUB %A, %B, %C.
7504 /// The main reason why this method is called before the replacement of the
7505 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7506 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
7508 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7509 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7510 unsigned ExpectedUses) {
7511 if (Opnd0.getOpcode() != ISD::FMUL ||
7512 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) ||
7513 !Subtarget.hasAnyFMA())
7516 // FIXME: These checks must match the similar ones in
7517 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7518 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7519 // or MUL + ADDSUB to FMADDSUB.
7520 const TargetOptions &Options = DAG.getTarget().Options;
  bool AllowFusion =
7522       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
  if (!AllowFusion)
    return false;
7527 Opnd1 = Opnd0.getOperand(1);
7528   Opnd0 = Opnd0.getOperand(0);
  return true;
}
7533 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7534 /// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
7535 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7536 const X86Subtarget &Subtarget,
7537 SelectionDAG &DAG) {
7538 SDValue Opnd0, Opnd1;
7539 unsigned NumExtracts;
7540   if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts))
    return SDValue();
7543   MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);
7546   // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
7548 // TODO: According to coverage reports, the FMADDSUB transform is not
7549 // triggered by any tests.
7550 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
7551 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7553 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7554 // the ADDSUB idiom has been successfully recognized. There are no known
7555 // X86 targets with 512-bit ADDSUB instructions!
7556   // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
  // recognition.
7558   if (VT.is512BitVector())
    return SDValue();

7561   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
7564 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7565 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7566 const X86Subtarget &Subtarget,
7567 SelectionDAG &DAG) {
7568 MVT VT = BV->getSimpleValueType(0);
7569 unsigned NumElts = VT.getVectorNumElements();
7570 unsigned NumUndefsLO = 0;
7571 unsigned NumUndefsHI = 0;
7572 unsigned Half = NumElts/2;
7574   // Count the number of UNDEF operands in the input build_vector.
7575 for (unsigned i = 0, e = Half; i != e; ++i)
7576     if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;
7579 for (unsigned i = Half, e = NumElts; i != e; ++i)
7580     if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;
7583 // Early exit if this is either a build_vector of all UNDEFs or all the
7584 // operands but one are UNDEF.
7585   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  SDLoc DL(BV);
7589 SDValue InVec0, InVec1;
7590 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7591 // Try to match an SSE3 float HADD/HSUB.
7592 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7593 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7595 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7596 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7597 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7598 // Try to match an SSSE3 integer HADD/HSUB.
7599 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7600 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7602 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7603 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7606 if (!Subtarget.hasAVX())
7609 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7610 // Try to match an AVX horizontal add/sub of packed single/double
7611 // precision floating point values from 256-bit vectors.
7612 SDValue InVec2, InVec3;
7613 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7614 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7615 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7616 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7617 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7619 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7620 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7621 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7622 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7623 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7624 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7625 // Try to match an AVX2 horizontal add/sub of signed integers.
7626 SDValue InVec2, InVec3;
7628 bool CanFold = true;
7630 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7631 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7632 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7633 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7634 X86Opcode = X86ISD::HADD;
7635 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7636 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7637 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7638 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7639       X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
7644       // Fold this build_vector into a single horizontal add/sub.
7645 // Do this only if the target has AVX2.
7646 if (Subtarget.hasAVX2())
7647 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7649 // Do not try to expand this build_vector into a pair of horizontal
7650 // add/sub if we can emit a pair of scalar add/sub.
7651       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();
7654       // Convert this build_vector into a pair of horizontal binops followed by
      // a concat vector.
7656 bool isUndefLO = NumUndefsLO == Half;
7657 bool isUndefHI = NumUndefsHI == Half;
7658 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7659                                    isUndefLO, isUndefHI);
    }
  }
7663 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7664        VT == MVT::v16i16) && Subtarget.hasAVX()) {
    unsigned X86Opcode;
7666 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7667 X86Opcode = X86ISD::HADD;
7668 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7669 X86Opcode = X86ISD::HSUB;
7670 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7671 X86Opcode = X86ISD::FHADD;
7672 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7673       X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();
7677 // Don't try to expand this build_vector into a pair of horizontal add/sub
7678 // if we can simply emit a pair of scalar add/sub.
7679     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();
7682     // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
7684 bool isUndefLO = NumUndefsLO == Half;
7685 bool isUndefHI = NumUndefsHI == Half;
7686 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7687                                  isUndefLO, isUndefHI);
  }

  return SDValue();
}
7693 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7694 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
7695 /// just apply the bit op to the vectors.
7696 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7697 /// from this, but enough scalar bit operations are created from the later
7698 /// legalization + scalarization stages to need basic support.
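/// e.g. (build_vector (xor a, 1), (xor b, 2)) is lowered to
/// (xor (build_vector a, b), (build_vector 1, 2)).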
7699 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7700 SelectionDAG &DAG) {
  SDLoc DL(Op);
7702   MVT VT = Op->getSimpleValueType(0);
7703 unsigned NumElems = VT.getVectorNumElements();
7704 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7706 // Check that all elements have the same opcode.
7707 // TODO: Should we allow UNDEFS and if so how many?
7708 unsigned Opcode = Op->getOperand(0).getOpcode();
7709 for (unsigned i = 1; i < NumElems; ++i)
7710     if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();
7713 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  switch (Opcode) {
  default:
    return SDValue();
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
7720     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
    break;
  }
7725 SmallVector<SDValue, 4> LHSElts, RHSElts;
7726 for (SDValue Elt : Op->ops()) {
7727 SDValue LHS = Elt.getOperand(0);
7728 SDValue RHS = Elt.getOperand(1);
7730 // We expect the canonicalized RHS operand to be the constant.
7731     if (!isa<ConstantSDNode>(RHS))
      return SDValue();
7733 LHSElts.push_back(LHS);
7734 RHSElts.push_back(RHS);
7737 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7738 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7739   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
7742 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7743 /// functionality to do this, so it's all zeros, all ones, or some derivation
7744 /// that is cheap to calculate.
7745 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7746 const X86Subtarget &Subtarget) {
7748 MVT VT = Op.getSimpleValueType();
7750 // Vectors containing all zeros can be matched by pxor and xorps.
7751 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7752 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7753 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7754 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7757 return getZeroVector(VT, Subtarget, DAG, DL);
7760 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7761 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7762 // vpcmpeqd on 256-bit vectors.
7763 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7764 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7765 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7768 return getOnesVector(VT, DAG, DL);
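// Worked example (illustrative): an all-ones v4i32 build_vector needs no
// constant-pool load; getOnesVector emits the idiomatic PCMPEQD xmm, xmm,
// which sets every bit regardless of the register's prior contents.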
7774 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
7775 // reasoned to be a permutation of a vector by indices in a non-constant vector.
7776 // (build_vector (extract_elt V, (extract_elt I, 0)),
7777 // (extract_elt V, (extract_elt I, 1)),
7782 // TODO: Handle undefs
7783 // TODO: Utilize pshufb and zero mask blending to support more efficient
7784 // construction of vectors with constant-0 elements.
7785 // TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
7786 // when no native operation available.
7788 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
7789 const X86Subtarget &Subtarget) {
7790 // Look for VPERMV and PSHUFB opportunities.
7791 MVT VT = V.getSimpleValueType();
7792 switch (VT.SimpleTy) {
7796 if (!Subtarget.hasSSE3())
7801 if (!Subtarget.hasAVX2())
7806 if (!Subtarget.hasVLX())
7813 if (!Subtarget.hasAVX512())
7817 if (!Subtarget.hasBWI())
7822 if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
7826 if (!Subtarget.hasVBMI())
7830 if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
7834 SDValue SrcVec, IndicesVec;
7835 // Check for a match of the permute source vector and permute index elements.
7836 // This is done by checking that the i-th build_vector operand is of the form:
7837 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
7838 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
7839 SDValue Op = V.getOperand(Idx);
7840 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7843 // If this is the first extract encountered in V, set the source vector,
7844 // otherwise verify the extract is from the previously defined source
7847 SrcVec = Op.getOperand(0);
7848 else if (SrcVec != Op.getOperand(0))
7850 SDValue ExtractedIndex = Op->getOperand(1);
7851 // Peek through extends.
7852 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
7853 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
7854 ExtractedIndex = ExtractedIndex.getOperand(0);
7855 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7858 // If this is the first extract from the index vector candidate, set the
7859 // indices vector, otherwise verify the extract is from the previously
7860 // defined indices vector.
7862 IndicesVec = ExtractedIndex.getOperand(0);
7863 else if (IndicesVec != ExtractedIndex.getOperand(0))
7866 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
7867 if (!PermIdx || PermIdx->getZExtValue() != Idx)
7871 if (VT.isFloatingPoint())
7872 IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
7873 VT.getVectorNumElements());
7874 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
7875 // VPERMV takes (mask, source) operands, while PSHUFB takes (source, mask).
7876 if (VT == MVT::v16i8)
return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
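// Illustrative sketch (assumed shapes, not from this file): the matcher above
// accepts
//   (build_vector (extract_elt V, (extract_elt I, 0)),
//                 (extract_elt V, (extract_elt I, 1)), ...)
// and emits one runtime permutation of V by the index vector I: PSHUFB for
// v16i8, VPERMV for the wider types admitted by the switch above.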
7880 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7883 MVT VT = Op.getSimpleValueType();
7884 MVT ExtVT = VT.getVectorElementType();
7885 unsigned NumElems = Op.getNumOperands();
7887 // Lower BUILD_VECTORs of vXi1 predicate vectors separately.
7888 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7889 return LowerBUILD_VECTORvXi1(Op, DAG);
7891 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7892 return VectorConstant;
7894 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7895 // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
7897 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7899 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7900 return HorizontalOp;
7901 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7903 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7906 unsigned EVTBits = ExtVT.getSizeInBits();
7908 unsigned NumZero = 0;
7909 unsigned NumNonZero = 0;
7910 uint64_t NonZeros = 0;
7911 bool IsAllConstants = true;
7912 SmallSet<SDValue, 8> Values;
7913 unsigned NumConstants = NumElems;
7914 for (unsigned i = 0; i < NumElems; ++i) {
7915 SDValue Elt = Op.getOperand(i);
7919 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
7920 IsAllConstants = false;
7923 if (X86::isZeroNode(Elt))
7926 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7927 NonZeros |= ((uint64_t)1 << i);
7932 // An all-undef vector: return UNDEF. All-zero vectors were handled above.
7933 if (NumNonZero == 0)
7934 return DAG.getUNDEF(VT);
7936 // If we are inserting one variable into a vector of non-zero constants, try
7937 // to avoid loading each constant element as a scalar. Load the constants as a
7938 // vector and then insert the variable scalar element. If insertion is not
7939 // supported, we assume that we will fall back to a shuffle to get the scalar
7940 // blended with the constants. Insertion into a zero vector is handled as a
7941 // special case further below.
7942 LLVMContext &Context = *DAG.getContext();
7943 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
7944 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
7945 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
7946 // Create an all-constant vector. The variable element in the old
7947 // build vector is replaced by undef in the constant vector. Save the
7948 // variable scalar element and its index for use in the insertelement.
7949 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
7950 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
7953 for (unsigned i = 0; i != NumElems; ++i) {
7954 SDValue Elt = Op.getOperand(i);
7955 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
7956 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
7957 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
7958 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
7959 else if (!Elt.isUndef()) {
7960 assert(!VarElt.getNode() && !InsIndex.getNode() &&
7961 "Expected one variable element in this vector");
7963 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
7966 Constant *CV = ConstantVector::get(ConstVecOps);
7967 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
7969 // The constants we just created may not be legal (e.g., floating point). We
7970 // must lower the vector right here because we cannot guarantee that we'll
7971 // legalize it before loading it. This is also why we could not just create
7972 // a new build vector here. If the build vector contains illegal constants,
7973 // it could get split back up into a series of insert elements.
7974 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
7975 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
7976 MachineFunction &MF = DAG.getMachineFunction();
7977 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
7978 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
7979 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
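// Worked example (illustrative): for <4 x float> <1.0, 2.0, %x, 4.0> the path
// above loads <1.0, 2.0, undef, 4.0> once from the constant pool and emits a
// single INSERT_VECTOR_ELT of %x at index 2, rather than scalarizing all four
// elements.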
7982 // Special case for a single non-zero, non-undef element.
7983 if (NumNonZero == 1) {
7984 unsigned Idx = countTrailingZeros(NonZeros);
7985 SDValue Item = Op.getOperand(Idx);
7987 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7988 // the value are obviously zero, truncate the value to i32 and do the
7989 // insertion that way. Only do this if the value is non-constant or if the
7990 // value is a constant being inserted into element 0. It is cheaper to do
7991 // a constant pool load than it is to do a movd + shuffle.
7992 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7993 (!IsAllConstants || Idx == 0)) {
7994 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7996 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7997 MVT VecVT = MVT::v4i32;
7999 // Truncate the value (which may itself be a constant) to i32, and
8000 // convert it to a vector with movd (S2V+shuffle to zero extend).
8001 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
8002 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
8003 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
8004 Item, Idx * 2, true, Subtarget, DAG));
8008 // If we have a constant or non-constant insertion into the low element of
8009 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8010 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8011 // depending on what the source datatype is.
8014 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8016 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
8017 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
8018 assert((VT.is128BitVector() || VT.is256BitVector() ||
8019 VT.is512BitVector()) &&
8020 "Expected an SSE value type!");
8021 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8022 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8023 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8026 // We can't directly insert an i8 or i16 into a vector, so zero extend
8028 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
8029 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8030 if (VT.getSizeInBits() >= 256) {
8031 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8032 if (Subtarget.hasAVX()) {
8033 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8034 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8036 // Without AVX, we need to extend to a 128-bit vector and then
8037 // insert into the 256-bit vector.
8038 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8039 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
8040 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
8043 assert(VT.is128BitVector() && "Expected an SSE value type!");
8044 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8045 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8047 return DAG.getBitcast(VT, Item);
8051 // Is it a vector logical left shift?
8052 if (NumElems == 2 && Idx == 1 &&
8053 X86::isZeroNode(Op.getOperand(0)) &&
8054 !X86::isZeroNode(Op.getOperand(1))) {
8055 unsigned NumBits = VT.getSizeInBits();
8056 return getVShift(true, VT,
8057 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8058 VT, Op.getOperand(1)),
8059 NumBits/2, DAG, *this, dl);
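// Worked example (illustrative): v2i64 <0, %x> becomes
// (scalar_to_vector %x) shifted left by NumBits/2 == 64 bits, i.e. a single
// PSLLDQ that moves %x into element 1 while zero-filling element 0.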
8062 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8065 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8066 // is a non-constant being inserted into an element other than the low one,
8067 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8068 // movd/movss) to move this into the low element, then shuffle it into
8070 if (EVTBits == 32) {
8071 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8072 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8076 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8077 if (Values.size() == 1) {
8078 if (EVTBits == 32) {
8079 // Instead of a shuffle like this:
8080 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8081 // Check if it's possible to issue this instead:
8082 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8083 unsigned Idx = countTrailingZeros(NonZeros);
8084 SDValue Item = Op.getOperand(Idx);
8085 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8086 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8091 // A vector full of immediates; various special cases are already
8092 // handled, so this is best done with a single constant-pool load.
8096 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8099 // See if we can use a vector load to get all of the elements.
8100 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
8101 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8103 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8107 // For AVX-length vectors, build the individual 128-bit pieces and use
8108 // shuffles to put them in place.
8109 if (VT.is256BitVector() || VT.is512BitVector()) {
8110 EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);
8112 // Build both the lower and upper subvector.
8114 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8115 SDValue Upper = DAG.getBuildVector(
8116 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8118 // Recreate the wider vector with the lower and upper part.
8119 if (VT.is256BitVector())
8120 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8121 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
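// Illustrative: a v8i32 build_vector is rebuilt here as two v4i32 halves
// (elements 0..3 and 4..7) glued back together by concat128BitVectors, which
// ultimately selects to vinsertf128/vinserti128 on AVX targets.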
8124 // Let legalizer expand 2-wide build_vectors.
8125 if (EVTBits == 64) {
8126 if (NumNonZero == 1) {
8127 // One half is zero or undef.
8128 unsigned Idx = countTrailingZeros(NonZeros);
8129 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8130 Op.getOperand(Idx));
8131 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8136 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8137 if (EVTBits == 8 && NumElems == 16)
8138 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8142 if (EVTBits == 16 && NumElems == 8)
8143 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8147 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8148 if (EVTBits == 32 && NumElems == 4)
8149 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8152 // If element VT is == 32 bits, turn it into a number of shuffles.
8153 if (NumElems == 4 && NumZero > 0) {
8154 SmallVector<SDValue, 8> Ops(NumElems);
8155 for (unsigned i = 0; i < 4; ++i) {
8156 bool isZero = !(NonZeros & (1ULL << i));
8158 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8160 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8163 for (unsigned i = 0; i < 2; ++i) {
8164 switch ((NonZeros >> (i*2)) & 0x3) {
8165 default: llvm_unreachable("Unexpected NonZero count");
8167 Ops[i] = Ops[i*2]; // Must be a zero vector.
8170 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8173 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8176 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8181 bool Reverse1 = (NonZeros & 0x3) == 2;
8182 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8186 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8187 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8189 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8192 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8194 // Check for a build_vector that is mostly a shuffle plus a few insertions.
8195 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8198 // For SSE 4.1, use insertps to put the high elements into the low element.
8199 if (Subtarget.hasSSE41()) {
8201 if (!Op.getOperand(0).isUndef())
8202 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8204 Result = DAG.getUNDEF(VT);
8206 for (unsigned i = 1; i < NumElems; ++i) {
8207 if (Op.getOperand(i).isUndef()) continue;
8208 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8209 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8214 // Otherwise, expand into a number of unpckl*; start by extending each of
8215 // our (non-undef) elements to the full vector width with the element in the
8216 // bottom slot of the vector (which generates no code for SSE).
8217 SmallVector<SDValue, 8> Ops(NumElems);
8218 for (unsigned i = 0; i < NumElems; ++i) {
8219 if (!Op.getOperand(i).isUndef())
8220 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8222 Ops[i] = DAG.getUNDEF(VT);
8225 // Next, we iteratively mix elements, e.g. for v4f32:
8226 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8227 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8228 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8229 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8230 // Generate scaled UNPCKL shuffle mask.
8231 SmallVector<int, 16> Mask;
8232 for (unsigned i = 0; i != Scale; ++i)
8234 for (unsigned i = 0; i != Scale; ++i)
8235 Mask.push_back(NumElems+i);
8236 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8238 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8239 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8244 // 256-bit AVX can use the vinsertf128 instruction
8245 // to create 256-bit vectors from two other 128-bit ones.
8246 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
8248 MVT ResVT = Op.getSimpleValueType();
8250 assert((ResVT.is256BitVector() ||
8251 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8253 SDValue V1 = Op.getOperand(0);
8254 SDValue V2 = Op.getOperand(1);
8255 unsigned NumElems = ResVT.getVectorNumElements();
8256 if (ResVT.is256BitVector())
8257 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8259 if (Op.getNumOperands() == 4) {
8260 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8261 ResVT.getVectorNumElements()/2);
8262 SDValue V3 = Op.getOperand(2);
8263 SDValue V4 = Op.getOperand(3);
8264 return concat256BitVectors(
8265 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
8266 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
8269 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8272 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8273 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8274 static bool isExpandWithZeros(const SDValue &Op) {
8275 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8276 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8278 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8279 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8285 // Returns the source node if the given node is a type promotion (by concatenating i1
8286 // zeros) of the result of a node that already zeros all upper bits of
8288 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8289 unsigned Opc = Op.getOpcode();
8291 assert(Opc == ISD::CONCAT_VECTORS &&
8292 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8293 "Unexpected node to check for type promotion!");
8295 // As long as we are concatenating zeros to the upper part of a previous node
8296 // result, climb up the tree until a node with different opcode is
8298 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8299 if (Opc == ISD::INSERT_SUBVECTOR) {
8300 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8301 Op.getConstantOperandVal(2) == 0)
8302 Op = Op.getOperand(1);
8305 } else { // Opc == ISD::CONCAT_VECTORS
8306 if (isExpandWithZeros(Op))
8307 Op = Op.getOperand(0);
8311 Opc = Op.getOpcode();
8314 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8315 // of a node that zeros the upper bits (its masked version).
8316 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8317 (Op.getOpcode() == ISD::AND &&
8318 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8319 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8326 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8327 const X86Subtarget &Subtarget,
8328 SelectionDAG &DAG) {
8330 MVT ResVT = Op.getSimpleValueType();
8331 unsigned NumOperands = Op.getNumOperands();
8333 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8334 "Unexpected number of operands in CONCAT_VECTORS");
8336 // If this node promotes - by concatenating zeroes - the type of the result
8337 // of a node whose instruction already zeroes all upper (irrelevant) bits of
8338 // the output register, mark it as legal and catch the pattern in instruction
8339 // selection to avoid emitting extra instructions (for zeroing upper bits).
8340 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8341 SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
8342 SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
8343 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
8347 unsigned NumZero = 0;
8348 unsigned NumNonZero = 0;
8349 uint64_t NonZeros = 0;
8350 for (unsigned i = 0; i != NumOperands; ++i) {
8351 SDValue SubVec = Op.getOperand(i);
8352 if (SubVec.isUndef())
8354 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8357 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8358 NonZeros |= (uint64_t)1 << i;
8364 // If there are zero or one non-zeros we can handle this very simply.
8365 if (NumNonZero <= 1) {
8366 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8367 : DAG.getUNDEF(ResVT);
8370 unsigned Idx = countTrailingZeros(NonZeros);
8371 SDValue SubVec = Op.getOperand(Idx);
8372 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8373 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8374 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
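// Worked example (illustrative): (concat_vectors undef, X, zero, zero) with
// v4i1 operands has NonZeros == 0b0010, so Idx == 1 and the single non-zero
// subvector X is inserted into an all-zeros v16i1 at element offset 1 * 4.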
8377 if (NumOperands > 2) {
8378 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8379 ResVT.getVectorNumElements()/2);
8380 ArrayRef<SDUse> Ops = Op->ops();
8381 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8382 Ops.slice(0, NumOperands/2));
8383 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8384 Ops.slice(NumOperands/2));
8385 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8388 assert(NumNonZero == 2 && "Simple cases not handled?");
8390 if (ResVT.getVectorNumElements() >= 16)
8391 return Op; // The operation is legal with KUNPCK
8393 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8394 DAG.getUNDEF(ResVT), Op.getOperand(0),
8395 DAG.getIntPtrConstant(0, dl));
8396 unsigned NumElems = ResVT.getVectorNumElements();
8397 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8398 DAG.getIntPtrConstant(NumElems/2, dl));
8401 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8402 const X86Subtarget &Subtarget,
8403 SelectionDAG &DAG) {
8404 MVT VT = Op.getSimpleValueType();
8405 if (VT.getVectorElementType() == MVT::i1)
8406 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8408 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8409 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8410 Op.getNumOperands() == 4)));
8412 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8413 // from two other 128-bit ones.
8415 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8416 return LowerAVXCONCAT_VECTORS(Op, DAG);
8419 //===----------------------------------------------------------------------===//
8420 // Vector shuffle lowering
8422 // This is an experimental code path for lowering vector shuffles on x86. It is
8423 // designed to handle arbitrary vector shuffles and blends, gracefully
8424 // degrading performance as necessary. It works hard to recognize idiomatic
8425 // shuffles and lower them to optimal instruction patterns without leaving
8426 // a framework that allows reasonably efficient handling of all vector shuffle
8428 //===----------------------------------------------------------------------===//
8430 /// \brief Tiny helper function to identify a no-op mask.
8432 /// This is a somewhat boring predicate function. It checks whether the mask
8433 /// array input, which is assumed to be a single-input shuffle mask of the kind
8434 /// used by the X86 shuffle instructions (not a fully general
8435 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8436 /// in-place shuffle are 'no-op's.
8437 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8438 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8439 assert(Mask[i] >= -1 && "Out of bound mask element!");
8440 if (Mask[i] >= 0 && Mask[i] != i)
8446 /// \brief Test whether there are elements crossing 128-bit lanes in this
8449 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8450 /// and we routinely test for these.
8451 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8452 int LaneSize = 128 / VT.getScalarSizeInBits();
8453 int Size = Mask.size();
8454 for (int i = 0; i < Size; ++i)
8455 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8460 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8462 /// This checks a shuffle mask to see if it is performing the same
8463 /// lane-relative shuffle in each sub-lane. This trivially implies
8464 /// that it is also not lane-crossing. It may however involve a blend from the
8465 /// same lane of a second vector.
8467 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8468 /// non-trivial to compute in the face of undef lanes. The representation is
8469 /// suitable for use with existing 128-bit shuffles as entries from the second
8470 /// vector have been remapped to [LaneSize, 2*LaneSize).
8471 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8473 SmallVectorImpl<int> &RepeatedMask) {
8474 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8475 RepeatedMask.assign(LaneSize, -1);
8476 int Size = Mask.size();
8477 for (int i = 0; i < Size; ++i) {
8478 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8481 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8482 // This entry crosses lanes, so there is no way to model this shuffle.
8485 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8486 // Adjust second vector indices to start at LaneSize instead of Size.
8487 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8488 : Mask[i] % LaneSize + LaneSize;
8489 if (RepeatedMask[i % LaneSize] < 0)
8490 // This is the first non-undef entry in this slot of a 128-bit lane.
8491 RepeatedMask[i % LaneSize] = LocalM;
8492 else if (RepeatedMask[i % LaneSize] != LocalM)
8493 // Found a mismatch with the repeated mask.
8499 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8501 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8502 SmallVectorImpl<int> &RepeatedMask) {
8503 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
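// Worked example (illustrative): the v8i32 mask <0, 0, 2, 2, 4, 4, 6, 6>
// performs <0, 0, 2, 2> within each 128-bit lane (indices 4..7 reduce to the
// same lane-local values), so the helper above sets RepeatedMask to
// {0, 0, 2, 2}.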
8506 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8508 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8509 SmallVectorImpl<int> &RepeatedMask) {
8510 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8513 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8514 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8515 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8517 SmallVectorImpl<int> &RepeatedMask) {
8518 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8519 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8520 int Size = Mask.size();
8521 for (int i = 0; i < Size; ++i) {
8522 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8523 if (Mask[i] == SM_SentinelUndef)
8525 if (Mask[i] == SM_SentinelZero) {
8526 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8528 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8531 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8532 // This entry crosses lanes, so there is no way to model this shuffle.
8535 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8536 // Adjust second vector indices to start at LaneSize instead of Size.
8538 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8539 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8540 // This is the first non-undef entry in this slot of a 128-bit lane.
8541 RepeatedMask[i % LaneSize] = LocalM;
8542 else if (RepeatedMask[i % LaneSize] != LocalM)
8543 // Found a mismatch with the repeated mask.
8549 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8552 /// This is a fast way to test a shuffle mask against a fixed pattern:
8554 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8556 /// It returns true if the mask is exactly as wide as the argument list, and
8557 /// each element of the mask is either -1 (signifying undef) or the value given
8558 /// in the argument.
8559 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8560 ArrayRef<int> ExpectedMask) {
8561 if (Mask.size() != ExpectedMask.size())
8564 int Size = Mask.size();
8566 // If the values are build vectors, we can look through them to find
8567 // equivalent inputs that make the shuffles equivalent.
8568 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8569 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8571 for (int i = 0; i < Size; ++i) {
8572 assert(Mask[i] >= -1 && "Out of bound mask element!");
8573 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8574 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8575 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8576 if (!MaskBV || !ExpectedBV ||
8577 MaskBV->getOperand(Mask[i] % Size) !=
8578 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8586 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8588 /// The masks must be exactly the same width.
8590 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8591 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8593 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8594 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8595 ArrayRef<int> ExpectedMask) {
8596 int Size = Mask.size();
8597 if (Size != (int)ExpectedMask.size())
8600 for (int i = 0; i < Size; ++i)
8601 if (Mask[i] == SM_SentinelUndef)
8603 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8605 else if (Mask[i] != ExpectedMask[i])
8611 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8613 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8614 const APInt &Zeroable) {
8615 int NumElts = Mask.size();
8616 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8618 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8619 for (int i = 0; i != NumElts; ++i) {
8621 if (M == SM_SentinelUndef)
8623 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8624 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8629 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8631 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8632 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8635 SmallVector<int, 8> Unpcklwd;
8636 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8637 /* Unary = */ false);
8638 SmallVector<int, 8> Unpckhwd;
8639 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8640 /* Unary = */ false);
8641 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8642 isTargetShuffleEquivalent(Mask, Unpckhwd));
8643 return IsUnpackwdMask;
8646 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8648 /// This helper function produces an 8-bit shuffle immediate corresponding to
8649 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8650 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8653 /// NB: We rely heavily on "undef" masks preserving the input lane.
8654 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8655 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8656 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8657 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8658 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8659 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8662 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8663 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8664 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8665 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8669 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8670 SelectionDAG &DAG) {
8671 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
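// Worked example (illustrative): the common reversal mask <3, 2, 1, 0>
// encodes as 3 | (2 << 2) | (1 << 4) | (0 << 6) == 0x1B, matching the
// familiar "shufps $0x1b" idiom; getV4X86ShuffleImm8ForMask wraps that value
// as the MVT::i8 immediate operand.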
8674 /// \brief Compute whether each element of a shuffle is zeroable.
8676 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8677 /// Either it is an undef element in the shuffle mask, the element of the input
8678 /// referenced is undef, or the element of the input referenced is known to be
8679 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8680 /// as many lanes with this technique as possible to simplify the remaining
8682 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8683 SDValue V1, SDValue V2) {
8684 APInt Zeroable(Mask.size(), 0);
8685 V1 = peekThroughBitcasts(V1);
8686 V2 = peekThroughBitcasts(V2);
8688 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8689 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8691 int VectorSizeInBits = V1.getValueSizeInBits();
8692 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8693 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8695 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8697 // Handle the easy cases.
8698 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8703 // Determine shuffle input and normalize the mask.
8704 SDValue V = M < Size ? V1 : V2;
8707 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8708 if (V.getOpcode() != ISD::BUILD_VECTOR)
8711 // If the BUILD_VECTOR has fewer elements than the shuffle mask, then the
8712 // bitcasted portion of the (larger) source element must be UNDEF/ZERO.
8713 if ((Size % V.getNumOperands()) == 0) {
8714 int Scale = Size / V->getNumOperands();
8715 SDValue Op = V.getOperand(M / Scale);
8716 if (Op.isUndef() || X86::isZeroNode(Op))
8718 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8719 APInt Val = Cst->getAPIntValue();
8720 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8721 Val = Val.getLoBits(ScalarSizeInBits);
8724 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8725 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8726 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8727 Val = Val.getLoBits(ScalarSizeInBits);
8734 // If the BUILD_VECTOR has more elements than the shuffle mask, then all of
8735 // the (smaller) source elements must be UNDEF or ZERO.
8736 if ((V.getNumOperands() % Size) == 0) {
8737 int Scale = V->getNumOperands() / Size;
8738 bool AllZeroable = true;
8739 for (int j = 0; j < Scale; ++j) {
8740 SDValue Op = V.getOperand((M * Scale) + j);
8741 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
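// Worked example (illustrative): shuffling v4i32 %V against an all-zeros
// build_vector with mask <0, 1, 4, 5> marks elements 2 and 3 zeroable (they
// read the zero input), letting later lowering use a zeroing move rather
// than a genuine two-input blend.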
8752 // The shuffle result has the form:
8753 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where 0* denotes a run of zero or
8754 // more zero elements and the a[] elements appear in ascending order.
8755 // Each element of Zeroable corresponds to an element of Mask, as described
// in computeZeroableShuffleElements.
8757 // The function looks for a sub-mask whose non-zero elements are in
8758 // increasing order; if such a sub-mask exists, the function returns true.
8759 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8760 ArrayRef<int> Mask, const EVT &VectorType,
8761 bool &IsZeroSideLeft) {
8762 int NextElement = -1;
8763 // Check if the Mask's nonzero elements are in increasing order.
8764 for (int i = 0, e = Mask.size(); i < e; i++) {
8765 // Check that the zeroed mask elements are built from only zero inputs.
8766 assert(Mask[i] >= -1 && "Out of bound mask element!");
8771 // Find the lowest non-zero element.
8772 if (NextElement < 0) {
8773 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8774 IsZeroSideLeft = NextElement != 0;
8776 // Exit if the mask's non-zero elements are not in increasing order.
8777 if (NextElement != Mask[i])
8784 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8785 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8786 ArrayRef<int> Mask, SDValue V1,
8788 const APInt &Zeroable,
8789 const X86Subtarget &Subtarget,
8790 SelectionDAG &DAG) {
8791 int Size = Mask.size();
8792 int LaneSize = 128 / VT.getScalarSizeInBits();
8793 const int NumBytes = VT.getSizeInBits() / 8;
8794 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8796 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8797 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8798 (Subtarget.hasBWI() && VT.is512BitVector()));
8800 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8801 // Sign bit set in i8 mask means zero element.
8802 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8805 for (int i = 0; i < NumBytes; ++i) {
8806 int M = Mask[i / NumEltBytes];
8808 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8811 if (Zeroable[i / NumEltBytes]) {
8812 PSHUFBMask[i] = ZeroMask;
8816 // We can only use a single input of V1 or V2.
8817 SDValue SrcV = (M >= Size ? V2 : V1);
8823 // PSHUFB can't cross lanes, ensure this doesn't happen.
8824 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8828 M = M * NumEltBytes + (i % NumEltBytes);
8829 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8831 assert(V && "Failed to find a source input");
8833 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8834 return DAG.getBitcast(
8835 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8836 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
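// Worked example (illustrative): splatting lane 0 of a v8i16 with mask
// <0, 0, 0, 0, 0, 0, 0, 0> expands each i16 lane to its two byte indices,
// producing the PSHUFB control <0, 1, 0, 1, ...>; zeroable lanes would get
// 0x80 bytes instead, which PSHUFB turns into zeros.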
8839 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8840 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8843 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
8844 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8845 const APInt &Zeroable,
8846 ArrayRef<int> Mask, SDValue &V1,
8847 SDValue &V2, SelectionDAG &DAG,
8848 const X86Subtarget &Subtarget) {
8849 bool IsLeftZeroSide = true;
8850 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8853 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8855 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8856 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8857 unsigned NumElts = VT.getVectorNumElements();
8858 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8859 "Unexpected number of vector elements");
8860 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8861 Subtarget, DAG, DL);
8862 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8863 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8864 return DAG.getSelect(DL, VT, VMask,
8865 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8869 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8870 unsigned &UnpackOpcode, bool IsUnary,
8871 ArrayRef<int> TargetMask, SDLoc &DL,
8873 const X86Subtarget &Subtarget) {
8874 int NumElts = VT.getVectorNumElements();
8876 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8877 for (int i = 0; i != NumElts; i += 2) {
8878 int M1 = TargetMask[i + 0];
8879 int M2 = TargetMask[i + 1];
8880 Undef1 &= (SM_SentinelUndef == M1);
8881 Undef2 &= (SM_SentinelUndef == M2);
8882 Zero1 &= isUndefOrZero(M1);
8883 Zero2 &= isUndefOrZero(M2);
8885 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8886 "Zeroable shuffle detected");
8888 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8889 SmallVector<int, 64> Unpckl, Unpckh;
8890 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8891 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8892 UnpackOpcode = X86ISD::UNPCKL;
8893 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8894 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8898 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8899 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8900 UnpackOpcode = X86ISD::UNPCKH;
8901 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8902 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8906 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
8907 if (IsUnary && (Zero1 || Zero2)) {
8908 // Don't bother if we can blend instead.
8909 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8910 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8913 bool MatchLo = true, MatchHi = true;
8914 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8915 int M = TargetMask[i];
8917 // Ignore if the input is known to be zero or the index is undef.
8918 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8919 (M == SM_SentinelUndef))
8922 MatchLo &= (M == Unpckl[i]);
8923 MatchHi &= (M == Unpckh[i]);
8926 if (MatchLo || MatchHi) {
8927 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8928 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8929 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8934 // If a binary shuffle, commute and try again.
8936 ShuffleVectorSDNode::commuteMask(Unpckl);
8937 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8938 UnpackOpcode = X86ISD::UNPCKL;
8943 ShuffleVectorSDNode::commuteMask(Unpckh);
8944 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8945 UnpackOpcode = X86ISD::UNPCKH;
8954 // X86 has dedicated unpack instructions that can handle specific blend
8955 // operations: UNPCKH and UNPCKL.
8956 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8957 ArrayRef<int> Mask, SDValue V1,
8958 SDValue V2, SelectionDAG &DAG) {
8959 SmallVector<int, 8> Unpckl;
8960 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8961 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8962 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8964 SmallVector<int, 8> Unpckh;
8965 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8966 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8967 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8969 // Commute and try again.
8970 ShuffleVectorSDNode::commuteMask(Unpckl);
8971 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8972 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8974 ShuffleVectorSDNode::commuteMask(Unpckh);
8975 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8976 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
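// Worked example (illustrative): for v4i32, unpcklps corresponds to the mask
// <0, 4, 1, 5>. A shuffle mask of <4, 0, 5, 1> fails the direct checks but
// equals the commuted form, so it is emitted as (UNPCKL V2, V1).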
8981 // X86 has dedicated pack instructions that can handle specific truncation
8982 // operations: PACKSS and PACKUS.
8983 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
8984 SDValue &V2, unsigned &PackOpcode,
8985 ArrayRef<int> TargetMask,
8987 const X86Subtarget &Subtarget) {
8988 unsigned NumElts = VT.getVectorNumElements();
8989 unsigned BitSize = VT.getScalarSizeInBits();
8990 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
8991 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
8993 auto MatchPACK = [&](SDValue N1, SDValue N2) {
8994 SDValue VV1 = DAG.getBitcast(PackVT, N1);
8995 SDValue VV2 = DAG.getBitcast(PackVT, N2);
8996 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
8997 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
9001 PackOpcode = X86ISD::PACKSS;
9005 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
9006 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
9007 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
9008 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
9012 PackOpcode = X86ISD::PACKUS;
9020 // Try binary shuffle.
9021 SmallVector<int, 32> BinaryMask;
9022 createPackShuffleMask(VT, BinaryMask, false);
9023 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
9024 if (MatchPACK(V1, V2))
9027 // Try unary shuffle.
9028 SmallVector<int, 32> UnaryMask;
9029 createPackShuffleMask(VT, UnaryMask, true);
9030 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
9031 if (MatchPACK(V1, V1))
9037 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9038 ArrayRef<int> Mask, SDValue V1,
9039 SDValue V2, SelectionDAG &DAG,
9040 const X86Subtarget &Subtarget) {
9042 unsigned PackOpcode;
9043 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
9045 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9046 DAG.getBitcast(PackVT, V2));
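// Illustrative note: a v16i8 shuffle that keeps the low byte of every i16
// lane of two inputs matches the binary pack mask; once ComputeNumSignBits
// proves more than 8 sign bits per i16 lane it lowers to a single PACKSSWB,
// and known-zero high bytes select PACKUS via the MaskedValueIsZero path.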
9051 /// \brief Try to emit a bitmask instruction for a shuffle.
9053 /// This handles cases where we can model a blend exactly as a bitmask due to
9054 /// one of the inputs being zeroable.
9055 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9056 SDValue V2, ArrayRef<int> Mask,
9057 const APInt &Zeroable,
9058 SelectionDAG &DAG) {
9059 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9060 MVT EltVT = VT.getVectorElementType();
9061 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9062 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9063 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9065 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9068 if (Mask[i] % Size != i)
9069 return SDValue(); // Not a blend.
9071 V = Mask[i] < Size ? V1 : V2;
9072 else if (V != (Mask[i] < Size ? V1 : V2))
9073 return SDValue(); // Can only let one input through the mask.
9075 VMaskOps[i] = AllOnes;
9078 return SDValue(); // No non-zeroable elements!
9080 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9081 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
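// Worked example (illustrative): v4i32 mask <0, 5, 2, 7> where V2 is known
// zero keeps V1 in lanes 0 and 2 and zeros lanes 1 and 3, so it reduces to
// (and V1, <-1, 0, -1, 0>) with no real blend instruction at all.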
9084 /// \brief Try to emit a blend instruction for a shuffle using bit math.
9086 /// This is used as a fallback approach when first class blend instructions are
9087 /// unavailable. Currently it is only suitable for integer vectors, but could
9088 /// be generalized for floating point vectors if desirable.
9089 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9090 SDValue V2, ArrayRef<int> Mask,
9091 SelectionDAG &DAG) {
9092 assert(VT.isInteger() && "Only supports integer vector types!");
9093 MVT EltVT = VT.getVectorElementType();
9094 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9095 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9096 SmallVector<SDValue, 16> MaskOps;
9097 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9098 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9099 return SDValue(); // Shuffled input!
9100 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9103 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9104 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9105 // We have to cast V2 through an i64 vector type to use X86ISD::ANDNP.
9106 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9107 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9108 DAG.getBitcast(MaskVT, V1Mask),
9109 DAG.getBitcast(MaskVT, V2)));
9110 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
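// Worked example (illustrative): the v8i16 blend <0, 9, 2, 11, 4, 13, 6, 15>
// builds V1Mask = <-1, 0, -1, 0, -1, 0, -1, 0> and computes
// (or (and V1, V1Mask), (andnp V1Mask, V2)) - the classic bit-select idiom.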
9113 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9114 SDValue PreservedSrc,
9115 const X86Subtarget &Subtarget,
9118 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9119 MutableArrayRef<int> TargetMask,
9120 bool &ForceV1Zero, bool &ForceV2Zero,
9121 uint64_t &BlendMask) {
9122 bool V1IsZeroOrUndef =
9123 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9124 bool V2IsZeroOrUndef =
9125 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9128 ForceV1Zero = false, ForceV2Zero = false;
9129 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9131 // Attempt to generate the binary blend mask. If an input is zero then
9132 // we can use any lane.
9133 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9134 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9135 int M = TargetMask[i];
9136 if (M == SM_SentinelUndef)
9140 if (M == i + Size) {
9141 BlendMask |= 1ull << i;
9144 if (M == SM_SentinelZero) {
9145 if (V1IsZeroOrUndef) {
9150 if (V2IsZeroOrUndef) {
9152 BlendMask |= 1ull << i;
9153 TargetMask[i] = i + Size;
9162 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
9164 uint64_t ScaledMask = 0;
9165 for (int i = 0; i != Size; ++i)
9166 if (BlendMask & (1ull << i))
9167 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
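// Worked example (illustrative): scaleVectorShuffleBlendMask(0x5, 4, 2)
// expands each selected element to two contiguous bits, 0b0101 -> 0b00110011
// (0x33), ready to drive a blend over the finer-grained element type.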
9171 /// \brief Try to emit a blend instruction for a shuffle.
9173 /// This doesn't do any checks for the availability of instructions for blending
9174 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9175 /// be matched in the backend with the type given. What it does check for is
9176 /// that the shuffle mask is a blend, or convertible into a blend with zero.
9177 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9178 SDValue V2, ArrayRef<int> Original,
9179 const APInt &Zeroable,
9180 const X86Subtarget &Subtarget,
9181 SelectionDAG &DAG) {
9182 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9184 uint64_t BlendMask = 0;
9185 bool ForceV1Zero = false, ForceV2Zero = false;
9186 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9190 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9192 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9194 V2 = getZeroVector(VT, Subtarget, DAG, DL);
9196 switch (VT.SimpleTy) {
9201 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9202 DAG.getConstant(BlendMask, DL, MVT::i8));
9206 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9210 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9211 // that instruction.
9212 if (Subtarget.hasAVX2()) {
9213 // Scale the blend by the number of 32-bit dwords per element.
9214 int Scale = VT.getScalarSizeInBits() / 32;
9215 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9216 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9217 V1 = DAG.getBitcast(BlendVT, V1);
9218 V2 = DAG.getBitcast(BlendVT, V2);
9219 return DAG.getBitcast(
9220 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9221 DAG.getConstant(BlendMask, DL, MVT::i8)));
9225 // For integer shuffles we need to expand the mask and cast the inputs to
9226 // v8i16s prior to blending.
9227 int Scale = 8 / VT.getVectorNumElements();
9228 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9229 V1 = DAG.getBitcast(MVT::v8i16, V1);
9230 V2 = DAG.getBitcast(MVT::v8i16, V2);
9231 return DAG.getBitcast(VT,
9232 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9233 DAG.getConstant(BlendMask, DL, MVT::i8)));
9237 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9238 SmallVector<int, 8> RepeatedMask;
9239 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9240 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9241 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
9243 for (int i = 0; i < 8; ++i)
9244 if (RepeatedMask[i] >= 8)
9245 BlendMask |= 1ull << i;
9246 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9247 DAG.getConstant(BlendMask, DL, MVT::i8));
9253 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9254 "256-bit byte-blends require AVX2 support!");
9256 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9258 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9259 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9260 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9263 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9264 if (SDValue Masked =
9265 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9268 // Scale the blend by the number of bytes per element.
9269 int Scale = VT.getScalarSizeInBits() / 8;
9271 // This form of blend is always done on bytes. Compute the byte vector
9273 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9275 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9276 // mix of LLVM's code generator and the x86 backend. We tell the code
9277 // generator that boolean values in the elements of an x86 vector register
9278 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9279 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9280 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9281 // of the element (the remaining are ignored) and 0 in that high bit would
9282 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9283 // the LLVM model for boolean values in vector elements gets the relevant
9284 // bit set, it is set backwards and over-constrained relative to x86's
9286 SmallVector<SDValue, 32> VSELECTMask;
9287 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9288 for (int j = 0; j < Scale; ++j)
9289 VSELECTMask.push_back(
9290 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9291 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9294 V1 = DAG.getBitcast(BlendVT, V1);
9295 V2 = DAG.getBitcast(BlendVT, V2);
9296 return DAG.getBitcast(
9298 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9308 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9309 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9310 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9313 llvm_unreachable("Not a supported integer vector type!");
9317 /// \brief Try to lower as a blend of elements from two inputs followed by
9318 /// a single-input permutation.
9320 /// This matches the pattern where we can blend elements from two inputs and
9321 /// then reduce the shuffle to a single-input permutation.
9322 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9323 SDValue V1, SDValue V2,
9325 SelectionDAG &DAG) {
9326 // We build up the blend mask while checking whether a blend is a viable way
9327 // to reduce the shuffle.
9328 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9329 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9331 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9335 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9337 if (BlendMask[Mask[i] % Size] < 0)
9338 BlendMask[Mask[i] % Size] = Mask[i];
9339 else if (BlendMask[Mask[i] % Size] != Mask[i])
9340 return SDValue(); // Can't blend in the needed input!
9342 PermuteMask[i] = Mask[i] % Size;
9345 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9346 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
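// Worked example (illustrative): v4f32 mask <2, 7, 0, 5> is not itself a
// blend, but it factors into the blend <0, 5, 2, 7> followed by the
// single-input permute <2, 3, 0, 1>, which is exactly what is built above.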
9349 /// \brief Generic routine to decompose a shuffle and blend into independent
9350 /// blends and permutes.
9352 /// This matches the extremely common pattern for handling combined
9353 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9354 /// operations. It will try to pick the best arrangement of shuffles and
9356 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9360 SelectionDAG &DAG) {
9361 // Shuffle the input elements into the desired positions in V1 and V2 and
9362 // blend them together.
9363 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9364 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9365 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9366 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9367 if (Mask[i] >= 0 && Mask[i] < Size) {
9368 V1Mask[i] = Mask[i];
9370 } else if (Mask[i] >= Size) {
9371 V2Mask[i] = Mask[i] - Size;
9372 BlendMask[i] = i + Size;
9375 // Try to lower with the simpler initial blend strategy unless one of the
9376 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9377 // shuffle may be able to fold with a load or other benefit. However, when
9378 // we would have to do 2x as many shuffles to achieve this, blending
9379 // first is the better strategy.
9380 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9381 if (SDValue BlendPerm =
9382 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9385 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9386 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9387 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
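// Worked example (illustrative): mask <5, 0, 7, 2> decomposes into
// V1Mask <undef, 0, undef, 2>, V2Mask <1, undef, 3, undef> and the blend
// <4, 1, 6, 3>; neither input shuffle is a no-op, so the cheaper
// blend-then-permute attempt above is tried first.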
9390 /// \brief Try to lower a vector shuffle as a rotation.
9392 /// This is used to support PALIGNR for SSSE3 and VALIGND/Q for AVX512.
9393 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9394 ArrayRef<int> Mask) {
9395 int NumElts = Mask.size();
9397 // We need to detect various ways of spelling a rotation:
9398 // [11, 12, 13, 14, 15, 0, 1, 2]
9399 // [-1, 12, 13, 14, -1, -1, 1, -1]
9400 // [-1, -1, -1, -1, -1, -1, 1, 2]
9401 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9402 // [-1, 4, 5, 6, -1, -1, 9, -1]
9403 // [-1, 4, 5, 6, -1, -1, -1, -1]
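  //
  // Illustrative trace (not from the original source): in the first mask
  // above, element 0 selects input element 11, so a rotated vector would
  // have started at index 0 - (11 % 8) = -3, a candidate rotation of 3;
  // element 5 selects input element 0, starting at 5 - 0 = 5, which also
  // gives 8 - 5 = 3. Every defined element agrees, so the mask is a
  // rotation by 3 elements.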
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head to skip.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
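///
/// Illustrative example (not from the original source): for the v8i16 mask
/// above, matchVectorShuffleAsRotate finds an element rotation of 3, and
/// with 16 / 8 = 2 bytes per element this becomes a byte rotation of 6,
/// i.e. a single PALIGNR with immediate 6 on SSSE3 targets.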
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation.
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
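///
/// Illustrative example (not from the original source): the v4i32 mask
/// [3, 4, 5, 6] matches with a rotation of 3; the matcher makes V2 the low
/// input and V1 the high input, so it lowers to a single VALIGND with
/// immediate 3.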
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSLL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
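///
/// Illustrative trace (not from the original source): for the v4i32 mask
/// [zz, 0, zz, 2], doubling the element width to 64 bits (Scale = 2) shows
/// each i64 element holding its low i32 shifted up by one slot with zeros
/// shifted in, so this lowers to a VSHLI (PSLLQ) of v2i64 by 32 bits.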
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // No viable shift was found.
  return -1;
}
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  unsigned Opcode;
  SDValue V = V1;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask, uint64_t &BitLen,
                                      uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    SDValue &V = (M < Size ? V1 : V2);
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    if (Idx < 0 || (Src == V && Idx == (M - i))) {
      Src = V;
      Idx = M - i;
      continue;
    }
    return false;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
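//
// Illustrative example (not from the original source): the v8i16 mask
// [0, 8, 9, 3, u, u, u, u] matches with Idx = 1 and Len = 2, inserting
// B[0..1] over A starting at element 1, i.e. BitIdx = 16 and BitLen = 32.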
static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                        ArrayRef<int> Mask, uint64_t &BitLen,
                                        uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;
  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    SDValue Base;
    // Attempt to match first source from mask before insertion point.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }
    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;
      // Match insertion.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }
      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }
      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }
  return false;
}
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can begin at an offset element index in the input; to avoid excess
/// shuffling the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must be from the same input
/// vector.
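///
/// Illustrative example (not from the original source): zero extending the
/// four low bytes of a v16i8 input to v4i32 uses Scale = 4 and Offset = 0;
/// on SSE4.1 targets this becomes a single PMOVZXBD, while pre-SSE4.1
/// targets expand it as a sequence of unpacks against a zero vector.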
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(LoIdx, DL, MVT::i8)));

    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
        !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      PSHUFBMask[i] = DAG.getConstant(
          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);

  return DAG.getBitcast(VT, InputV);
}
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
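///
/// Illustrative note (not from the original source): for a 128-bit vector
/// the search below first tries extending to two 64-bit elements, then four
/// 32-bit elements, and so on; e.g. for v16i8 it tries Scale = 8, then 4,
/// then 2 before falling back to the MOVQ special case.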
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient lowering
/// patterns across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    if (!VT.is128BitVector())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
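///
/// Illustrative example (not from the original source): broadcasting
/// element 1 of a v8i32 shuffle whose source is a v4i64 build_vector takes
/// Scale = 2, so we broadcast the high half of scalar 0: the scalar is
/// shifted right by 32 bits, truncated to i32, and then VBROADCASTed.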
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
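///
/// Illustrative example (not from the original source): a v4f32 shuffle
/// with mask [0, 0, 0, 0] is a broadcast of element 0; with AVX2 it lowers
/// to a single VBROADCASTSS from a register, while plain AVX can still use
/// it when the broadcast element comes from a foldable load.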
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        ? X86ISD::MOVDDUP
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      // Peek through bitcasts as long as BroadcastIdx can be adjusted.
      SDValue VSrc = V.getOperand(0);
      unsigned NumEltBits = V.getScalarValueSizeInBits();
      unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
      if ((NumEltBits % NumSrcBits) == 0)
        BroadcastIdx *= (NumEltBits / NumSrcBits);
      else if ((NumSrcBits % NumEltBits) == 0 &&
               (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
        BroadcastIdx /= (NumSrcBits / NumEltBits);
      else
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Ensure the source vector and BroadcastIdx are for a suitable type.
  if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
    unsigned NumEltBits = VT.getScalarSizeInBits();
    unsigned NumSrcBits = V.getScalarValueSizeInBits();
    if ((NumSrcBits % NumEltBits) == 0)
      BroadcastIdx *= (NumSrcBits / NumEltBits);
    else if ((NumEltBits % NumSrcBits) == 0 &&
             (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
      BroadcastIdx /= (NumEltBits / NumSrcBits);
    else
      return SDValue();

    unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
    MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
    V = DAG.getBitcast(SrcVT, V);
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
                   ? X86ISD::MOVDDUP
                   : Opcode;
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    DAG.makeEquivalentMemoryOrdering(Ld, V);
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
           "Unexpected vector size");
    V = extract128BitVector(V, BroadcastIdx, DAG, DL);
  }

  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (SrcVT.getSizeInBits() > 128) {
    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
                                 128 / SrcVT.getScalarSizeInBits());
    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
    V = DAG.getBitcast(ExtVT, V);
  }

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
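///
/// Illustrative example (not from the original source): the v4i32 mask
/// [0, 4, 2, 6] permutes both inputs with [0, 2, -1, -1] and then feeds
/// them to a single UNPCKL, which interleaves V1[0], V2[0], V1[2], V2[2].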
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");

  // If we have a single input, insert that into V1 if we can do so cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
      return Insertion;
  }

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// math.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
  }

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
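///
/// Illustrative example (not from the original source): [0, 1, 4, 5] needs
/// only one SHUFPS since the low half reads only V1 and the high half only
/// V2, while [0, 4, 1, 5] mixes both inputs in the low half and so fails.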
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
10952 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10954 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10955 /// It makes no assumptions about whether this is the *best* lowering, it simply
10957 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10958 ArrayRef<int> Mask, SDValue V1,
10959 SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the final
      // shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // the shuffle.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
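
  // For example, Mask = {0, 5, 2, 7} takes the mixed-lanes path above: the
  // first SHUFP builds V1' = [V1[0], V1[2], V2[1], V2[3]] from BlendMask
  // {0, 2, 1, 3}, and the final SHUFP below applies NewMask {0, 2, 1, 3} to
  // V1' to produce [V1[0], V2[1], V1[2], V2[3]].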
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}

/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
    // in SSE1 because otherwise they are widened to v2f64 and never get here.
    if (!Subtarget.hasSSE2()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V =
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions. These are only valid in SSE1 because
  // otherwise they are widened to v2f64 and never get here.
  if (!Subtarget.hasSSE2()) {
    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}

/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing into
    // a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                        Mask, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
            DL, MVT::v4i32, V1, V2, Mask, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}

/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
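  // Here "XToY" denotes mask entries that read from the X half of the source
  // and land in the Y half of the result; the counts above split the sorted,
  // deduplicated input lists at the half-way point (element 4).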

  // If we are splatting two values from one half - one to each half, then
  // we can shuffle that half so each is splatted to a dword, then splat those
  // to their respective halves.
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
                        int DOffset) {
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
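
  // For example, Mask = {0, 0, 0, 0, 1, 1, 1, 1} takes the first path above:
  // PSHUFLW forms the low words [0, 0, 1, 1] and the PSHUFD then splats dword
  // 0 across the low half and dword 1 across the high half.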

  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any sequence of PSHUFD instructions that
  // result into a single instruction. Here is an example of the tricky case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] ------------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of dword with only one word among the three inputs in
    // a half by taking the sum of the half with three inputs and subtracting
    // the sum of the actual three inputs. The difference is the remaining
    // slot.
    int ADWord, BDWord;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum - std::accumulate(TripleInputs.begin(),
                                         TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;
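
    // For example, with AOffset == 0 and AToAInputs == {0, 1, 3}, the sum of
    // all four slot indices is 6 and the three inputs sum to 4, so the word
    // left over is index 2 and TripleDWord is 1.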

    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(
              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }

    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M >= 0 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }

    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is no
          // free slot adjacent to one of the inputs. In this case, we have to
          // swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target half.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
    bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
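  // Scale is how many bytes each logical mask element covers (e.g. 1 for a
  // 16-element mask, 2 for an 8-element mask), so each element expands into
  // Scale consecutive byte selectors below.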
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] < 0) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }
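
  // PSHUFB treats any selector byte with the high bit set (0x80) as "write
  // zero", which is how both zeroable lanes and the unused input are encoded.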
  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}

/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use shift instructions.
    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Use dedicated pack instructions for masks that match their pattern.
    if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
                                               DAG, Subtarget))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                        Mask, Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
                                                     MutableMask, Subtarget,
                                                     DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
                                             Subtarget))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                    Mask, DAG);
}

/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};
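
  // ShuffleModulus is a power of two, so "& ModMask" below implements the
  // "% ShuffleModulus" in the (i * 2^N) % M check without a division.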

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // want.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}

/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
                                             Subtarget))
    return V;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // preference this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend as
      // an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // the unpack.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.

    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
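    // Splatting 0xFF at i16/i32/i64 granularity keeps only the lowest byte of
    // each 2-, 4-, or 8-byte element and zeroes the rest, so the unsigned
    // saturation in the PACKUS sequence below passes the values through.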
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
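
  // These blend masks still index individual bytes of V at this point; below
  // they are either halved to index the masked i16 lanes directly or used
  // as-is once each byte has been zero-extended into its own i16 lane.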

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    V = peekThroughBitcasts(V);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);
12343 // Now create two 4-way blends of these half-width vectors.
12344 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12345 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12346 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12347 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12348 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12349 for (int i = 0; i < SplitNumElements; ++i) {
12350 int M = HalfMask[i];
12351 if (M >= NumElements) {
12352 if (M >= NumElements + SplitNumElements)
12356 V2BlendMask[i] = M - NumElements;
12357 BlendMask[i] = SplitNumElements + i;
12358 } else if (M >= 0) {
12359 if (M >= SplitNumElements)
12363 V1BlendMask[i] = M;
12368 // Because the lowering happens after all combining takes place, we need to
12369 // manually combine these blend masks as much as possible so that we create
12370 // a minimal number of high-level vector shuffle nodes.
12372 // First try just blending the halves of V1 or V2.
12373 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12374 return DAG.getUNDEF(SplitVT);
12375 if (!UseLoV2 && !UseHiV2)
12376 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12377 if (!UseLoV1 && !UseHiV1)
12378 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12380 SDValue V1Blend, V2Blend;
12381 if (UseLoV1 && UseHiV1) {
12383 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12385 // We only use half of V1 so map the usage down into the final blend mask.
12386 V1Blend = UseLoV1 ? LoV1 : HiV1;
12387 for (int i = 0; i < SplitNumElements; ++i)
12388 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12389 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12391 if (UseLoV2 && UseHiV2) {
12393 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12395 // We only use half of V2 so map the usage down into the final blend mask.
12396 V2Blend = UseLoV2 ? LoV2 : HiV2;
12397 for (int i = 0; i < SplitNumElements; ++i)
12398 if (BlendMask[i] >= SplitNumElements)
12399 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12401 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12403 SDValue Lo = HalfBlend(LoMask);
12404 SDValue Hi = HalfBlend(HiMask);
12405 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                SDValue V1, SDValue V2,
                                                ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
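  // For example, the v4f64 mask <1, 5, 1, 5> broadcasts element 1 of each
  // input and blends the two broadcasts, rather than splitting the shuffle.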
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
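/// For example, the single-input v4f64 shuffle <2, 1, 0, 3> first swaps the
/// two 128-bit lanes (producing <2, 3, 0, 1>) and then blends the original
/// and flipped vectors with the now in-lane mask <4, 1, 6, 3>.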
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG,
                                                       const X86Subtarget &Subtarget) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  if (!Subtarget.hasAVX2()) {
    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    if (!LaneCrossing[0] || !LaneCrossing[1])
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
  } else {
    bool LaneUsed[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0)
        LaneUsed[(Mask[i] / LaneSize)] = true;
    if (!LaneUsed[0] || !LaneUsed[1])
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
  }

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);

  // Flip the vector, and blend the results which should now be in-lane.
  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
  SDValue Flipped = DAG.getBitcast(PVT, V1);
  Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
                                 {2, 3, 0, 1});
  Flipped = DAG.getBitcast(VT, Flipped);
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
  if (Subtarget.hasAVX2() && V2.isUndef())
    return SDValue();

  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // TODO: If minimizing size and one of the inputs is a zero vector and
  // the zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  bool IsLowZero = (Zeroable & 0x3) == 0x3;
  bool IsHighZero = (Zeroable & 0xc) == 0xc;

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsLowZero && !IsHighZero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {

      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
      // this will likely become vinsertf128 which can't fold a 256-bit memop.
      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                     VT.getVectorNumElements() / 2);
        SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                  DAG.getIntPtrConstant(0, DL));
        SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                  OnlyUsesV1 ? V1 : V2,
                                  DAG.getIntPtrConstant(0, DL));
        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
      }
    }

    // Try to use SHUF128 if possible.
    if (Subtarget.hasVLX()) {
      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
                            ((WidenedMask[1] % 2) << 1);
        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
                           DAG.getConstant(PermMask, DL, MVT::i8));
      }
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.
  //
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination
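  //
  // For example, the v4f64 mask <2, 3, 4, 5> widens to <1, 2>: the low half
  // of the destination takes V1's high 128 bits and the high half takes V2's
  // low 128 bits, giving a vperm2f128 immediate of 0x21.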

  assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");

  unsigned PermMask = 0;
  PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

  // Check the immediate mask and replace unused sources with undef.
  if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
    V1 = DAG.getUNDEF(VT);
  if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
    V2 = DAG.getUNDEF(VT);

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
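/// For example, the v4f64 shuffle <1, 0, 7, 6> is lowered by first forming
/// <V1.lane0, V2.lane1> with the lane-fixing shuffle <0, 1, 6, 7> and then
/// applying the repeating in-lane mask <1, 0, 3, 2> to the result.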
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
                                               SDValue V1, SDValue V2,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert((VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected 256-bit or 512-bit vector");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
  if (!UndefLower && !UndefUpper)
    return SDValue();

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (UndefUpper &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // If the shuffle only uses two of the four halves of the input operands,
  // then extract them and perform the 'half' shuffle at half width.
  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
  int HalfIdx1 = -1, HalfIdx2 = -1;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  unsigned Offset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;
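    // e.g. for v8i32 (HalfNumElts == 4), M == 13 comes from half vector 3
    // (the upper half of V2) as element 13 % 4 == 1.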

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return SDValue();
  }
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  int NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  int NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

  // uuuuXXXX - don't extract uppers just to insert again.
  if (UndefLower && NumUpperHalves != 0)
    return SDValue();

  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
  if (UndefUpper && NumUpperHalves == 2)
    return SDValue();

  // AVX2 - XXXXuuuu - always extract lowers.
  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();
    // AVX2 supports variable 32-bit element cross-lane shuffles.
    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
      // XXXXuuuu - don't extract lowers and uppers.
      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
        return SDValue();
    }
  }

  // AVX512 - XXXXuuuu - always extract lowers.
  if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
    return SDValue();

  auto GetHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getIntPtrConstant(HalfIdx, DL));
  };

  SDValue Half1 = GetHalfVector(HalfIdx1);
  SDValue Half2 = GetHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
                     DAG.getIntPtrConstant(Offset, DL));
}
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
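/// For example, with Size == 4 the mask <0, 5, 2, 7> leaves both inputs in
/// place, while <1, 5, 2, 7> moves element 1 of input 0 into slot 0, so
/// input 0 is not in place.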
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
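/// For example, on AVX2 the v4f64 mask <3, 3, 1, 1> can be lowered as the
/// in-lane repeating shuffle <1, u, 3, u> followed by the 64-bit sub-lane
/// permute <2, 2, 0, 0>.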
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only referencing the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;
      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }

  // Bail if the shuffle mask doesn't cross 128-bit lanes.
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  // Bail if we already have a repeated lane shuffle mask.
  SmallVector<int, 8> RepeatedShuffleMask;
  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
    return SDValue();

  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
  int NumSubLanes = NumLanes * SubLaneScale;
  int NumSubLaneElts = NumLaneElts / SubLaneScale;

  // Check that all the sources are coming from the same lane and see if we can
  // form a repeating shuffle mask (local to each sub-lane). At the same time,
  // determine the source sub-lane for each destination sub-lane.
  int TopSrcSubLane = -1;
  SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
  SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

  for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
    // Extract the sub-lane mask, check that it all comes from the same lane
    // and normalize the mask entries to come from the first lane.
    int SrcLane = -1;
    SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
      int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
      if (M < 0)
        continue;
      int Lane = (M % NumElts) / NumLaneElts;
      if ((0 <= SrcLane) && (SrcLane != Lane))
        return SDValue();
      SrcLane = Lane;
      int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
      SubLaneMask[Elt] = LocalM;
    }

    // Whole sub-lane is UNDEF.
    if (SrcLane < 0)
      continue;

    // Attempt to match against the candidate repeated sub-lane masks.
    for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
      auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
        for (int i = 0; i != NumSubLaneElts; ++i) {
          if (M1[i] < 0 || M2[i] < 0)
            continue;
          if (M1[i] != M2[i])
            return false;
        }
        return true;
      };

      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
      if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
        continue;

      // Merge the sub-lane mask into the matching repeated sub-lane mask.
      for (int i = 0; i != NumSubLaneElts; ++i) {
        int M = SubLaneMask[i];
        if (M < 0)
          continue;
        assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
               "Unexpected mask element");
        RepeatedSubLaneMask[i] = M;
      }

      // Track the top most source sub-lane - by setting the remaining to UNDEF
      // we can greatly simplify shuffle matching.
      int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
      TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
      Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
      break;
    }

    // Bail if we failed to find a matching repeated sub-lane mask.
    if (Dst2SrcSubLanes[DstSubLane] < 0)
      return SDValue();
  }
  assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
         "Unexpected source lane");

  // Create a repeating shuffle mask for the entire vector.
  SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
  for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
    int Lane = SubLane / SubLaneScale;
    auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
      int M = RepeatedSubLaneMask[Elt];
      if (M < 0)
        continue;
      int Idx = (SubLane * NumSubLaneElts) + Elt;
      RepeatedMask[Idx] = M + (Lane * NumLaneElts);
    }
  }
  SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

  // Shuffle each source sub-lane to its destination.
  SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
  for (int i = 0; i != NumElts; i += NumSubLaneElts) {
    int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
    if (SrcSubLane < 0)
      continue;
    for (int j = 0; j != NumSubLaneElts; ++j)
      SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
  }

  return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
                              SubLaneMask);
}
static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
                                         unsigned &ShuffleImm,
                                         ArrayRef<int> Mask) {
  int NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() == 64 &&
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
         "Unexpected data type for VSHUFPD");

  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
  // Mask for V4F64: 0/1,  4/5,  2/3,  6/7..
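  // e.g. the v4f64 mask <1, 5, 2, 7> fits the SHUFPD pattern directly and
  // produces ShuffleImm 0b1011.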
  ShuffleImm = 0;
  bool ShufpdMask = true;
  bool CommutableMask = true;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] < 0)
      return false;
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      CommutableMask = false;
    ShuffleImm |= (Mask[i] % 2) << i;
  }

  if (ShufpdMask)
    return true;
  if (CommutableMask) {
    std::swap(V1, V2);
    return true;
  }

  return false;
}
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");

  unsigned Immediate = 0;
  if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
    return SDValue();

  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                     DAG.getConstant(Immediate, DL, MVT::i8));
}
static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());

  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
  if (V2.isUndef())
    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);

  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
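      // e.g. the mask <0, 1, 3, 2> keeps the low lane in order and swaps the
      // elements of the high lane, giving the VPERMILPD immediate 0b0110.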
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
    }

    // With AVX2 we have direct support for this permutation.
    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

    // Try to create an in-lane repeating shuffle mask and then shuffle the
    // results into the target lanes.
    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return V;

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2,
                                                   Mask, DAG, Subtarget);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op =
          lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
    return Op;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
  if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                isShuffleMaskInputInPlace(1, Mask))))
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
                                               V1, V2, DAG, Subtarget))
      return V;

  // If we have AVX2 then we always want to lower with a blend because at v4 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

  if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on both lanes.
    SmallVector<int, 2> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
      SmallVector<int, 4> PSHUFDMask;
      scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v4i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
                      DAG.getBitcast(MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    // AVX2 provides a direct instruction for permuting a single input across
    // lanes.
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // If we have VLX support, we can use VALIGN or VEXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
                                               V1, V2, DAG, Subtarget))
      return V;
  }

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Rotate;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
  if (!isShuffleMaskInputInPlace(0, Mask) &&
      !isShuffleMaskInputInPlace(1, Mask))
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
                                                    Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes use the variable mask to VPERMILPS.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2,
                                                   Mask, DAG, Subtarget);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
                                               V1, V2, DAG, Subtarget))
      return V;

  // For non-AVX512, if the mask is of 16-bit elements in each lane, try to
  // split, since after the split we get more efficient code than with vblend
  // by using the vpunpcklwd and vpunpckhwd instructions.
  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
                                                     Mask, DAG))
      return V;

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // For non-AVX512, if the mask is of 16-bit elements in each lane, try to
  // split, since after the split we get more efficient code than with vblend
  // by using the vpunpcklwd and vpunpckhwd instructions.
  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
      !Subtarget.hasAVX512())
    if (SDValue V =
            lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
      return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // If we have VLX support, we can use VALIGN or EXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
                                               V1, V2, DAG, Subtarget))
      return V;
  }

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If the shuffle patterns aren't repeated but it is a single input, directly
  // generate a cross-lane VPERMD instruction.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
                                                  CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v8i32, ShufPS);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
                                                    Mask, DAG);
}
/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
                                             Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
                                                     Mask, DAG, Subtarget);

    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
      return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512BWVL can lower to VPERMW.
  if (Subtarget.hasBWI() && Subtarget.hasVLX())
    return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
                                             Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                   DAG, Subtarget);

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = VT.getVectorNumElements();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
  // can check for those subtargets here and avoid much of the subtarget
  // querying in the per-vector-type lowering routines. With AVX1 we have
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types. Since we'll use floating point types there eventually, just
  // immediately cast everything to a float and operate entirely in that domain.
  if (VT.isInteger() && !Subtarget.hasAVX2()) {
    int ElementBits = VT.getScalarSizeInBits();
    if (ElementBits < 32) {
      // No floating point type available; if we can't use the bit operations
      // for masking/blending then decompose into 128-bit vectors.
      if (SDValue V =
              lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
        return V;
      if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return V;
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
    }

    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                VT.getVectorNumElements());
    V1 = DAG.getBitcast(FpVT, V1);
    V2 = DAG.getBitcast(FpVT, V2);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
  }

  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}
/// \brief Try to lower a vector shuffle as 128-bit shuffles.
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
                                        ArrayRef<int> Mask, SDValue V1,
                                        SDValue V2, SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128bit shuffle.");

  // Handling a 256-bit vector here would require VLX, and
  // lowerV2X128VectorShuffle() is most probably the better solution there.
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // Check for patterns which can be matched with a single insert of a 256-bit
  // subvector.
  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
                                        {0, 1, 2, 3, 0, 1, 2, 3});
  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0, DL));
    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                              OnlyUsesV1 ? V1 : V2,
                              DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
  }

  assert(WidenedMask.size() == 4);

  // See if this is an insertion of the lower 128-bits of V2 into V1.
  bool IsInsert = true;
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1);
    if (WidenedMask[i] < 0)
      continue;

    // Make sure all V1 subvectors are in place.
    if (WidenedMask[i] < 4) {
      if (WidenedMask[i] != i) {
        IsInsert = false;
        break;
      }
    } else {
      // Make sure we only have a single V2 index and it's the lowest 128 bits.
      if (V2Index >= 0 || WidenedMask[i] != 4) {
        IsInsert = false;
        break;
      }
      V2Index = i;
    }
  }
  if (IsInsert && V2Index >= 0) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                                 DAG.getIntPtrConstant(0, DL));
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

  // Try to lower to vshuf64x2/vshuf32x4.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  unsigned PermMask = 0;
  // Ensure elements came from the same Op.
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1);
    if (WidenedMask[i] < 0)
      continue;

    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();

    // Convert the 128-bit shuffle mask selection values into 128-bit selection
    // bits defined by a vshuf64x2 instruction's immediate control byte.
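    // e.g. the widened mask <0, 1, 4, 5> takes the two low 128-bit quarters
    // of V1 followed by the two low quarters of V2, giving an immediate of
    // 0x44.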
    PermMask |= (WidenedMask[i] % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
    }

    SmallVector<int, 4> RepeatedMask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  }

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Shuf128;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Unpck;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op =
          lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Op;

  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
                                             V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
13841 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13842 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13843 const APInt &Zeroable,
13844 SDValue V1, SDValue V2,
13845 const X86Subtarget &Subtarget,
13846 SelectionDAG &DAG) {
13847 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13848 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13849 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13851 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13852 // options to efficiently lower the shuffle.
13853 SmallVector<int, 4> RepeatedMask;
13854 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13855 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
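    // E.g. the mask {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}
    // applies the lane-local pattern {1, 0, 3, 2} in all four lanes, so
    // RepeatedMask = {1, 0, 3, 2}.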
13857 // Use even/odd duplicate instructions for masks that match their pattern.
13858 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13859 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13860 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13861 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13867 // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue Unpck =
            lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
      return Unpck;

    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Otherwise, fall back to a SHUFPS sequence.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2,
                                        DAG);
  }
13880 // If we have a single input shuffle with different shuffle patterns in the
13881 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
13882 if (V2.isUndef() &&
13883 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
13884 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
  }
13888 // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
13896 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13897 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13898 const APInt &Zeroable,
13899 SDValue V1, SDValue V2,
13900 const X86Subtarget &Subtarget,
13901 SelectionDAG &DAG) {
13902 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13903 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13904 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13906 if (V2.isUndef()) {
13907 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on all four
    // 128-bit lanes.
13910 SmallVector<int, 2> Repeated128Mask;
13911 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13912 SmallVector<int, 4> PSHUFDMask;
13913 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
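      // E.g. the per-lane v2i64 mask {1, 0} scales to the v4i32 mask
      // {2, 3, 0, 1}, which PSHUFD applies within every 128-bit lane.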
      return DAG.getBitcast(
          MVT::v8i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                      DAG.getBitcast(MVT::v16i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }
13921 SmallVector<int, 4> Repeated256Mask;
13922 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13923 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Shuf128;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Unpck;
  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
                                             V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
13961 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13962 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13963 const APInt &Zeroable,
13964 SDValue V1, SDValue V2,
13965 const X86Subtarget &Subtarget,
13966 SelectionDAG &DAG) {
13967 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13968 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13969 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13971 // Whenever we can lower this as a zext, that instruction is strictly faster
13972 // than any alternative. It also allows us to fold memory operands into the
13973 // shuffle in many cases.
13974 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;
13978 // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes.
13981 SmallVector<int, 4> RepeatedMask;
13982 bool Is128BitLaneRepeatedShuffle =
13983 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13984 if (Is128BitLaneRepeatedShuffle) {
13985 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13990 // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue Unpck =
            lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
      return Unpck;
  }
13996 // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use byte rotation instructions.
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
14012 // Assume that a single SHUFPS is faster than using a permv shuffle.
14013 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
                                                  CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v16i32, ShufPS);
  }

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
14032 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
14033 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14034 const APInt &Zeroable,
14035 SDValue V1, SDValue V2,
14036 const X86Subtarget &Subtarget,
14037 SelectionDAG &DAG) {
14038 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14039 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14040 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14041 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
14043 // Whenever we can lower this as a zext, that instruction is strictly faster
14044 // than any alternative. It also allows us to fold memory operands into the
14045 // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
    return Unpck;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;
14065 if (V2.isUndef()) {
14066 SmallVector<int, 8> RepeatedMask;
14067 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14068 // As this is a single-input shuffle, the repeated mask should be
14069 // a strictly valid v8i16 mask that we can pass through to the v8i16
14070 // lowering to handle even the v32 case.
14071 return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
14083 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
14084 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14085 const APInt &Zeroable,
14086 SDValue V1, SDValue V2,
14087 const X86Subtarget &Subtarget,
14088 SelectionDAG &DAG) {
14089 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14090 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14091 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14092 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14094 // Whenever we can lower this as a zext, that instruction is strictly faster
14095 // than any alternative. It also allows us to fold memory operands into the
14096 // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
    return Unpck;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;
14120 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14121 if (Subtarget.hasVBMI())
14122 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
  // Try to create an in-lane repeating shuffle mask and then shuffle
  // the results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return V;
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
14138 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
14140 /// This routine either breaks down the specific type of a 512-bit x86 vector
14141 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14142 /// together based on the available instructions.
14143 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14144 MVT VT, SDValue V1, SDValue V2,
14145 const APInt &Zeroable,
14146 const X86Subtarget &Subtarget,
14147 SelectionDAG &DAG) {
14148 assert(Subtarget.hasAVX512() &&
14149 "Cannot lower 512-bit vectors w/ basic ISA!");
14151 // If we have a single input to the zero element, insert that into V1 if we
14152 // can do so cheaply.
14153 int NumElts = Mask.size();
14154 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14156 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14157 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
14161 // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;
14166 // Check for being able to broadcast a single element.
14167 if (SDValue Broadcast =
          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Broadcast;
14171 // Dispatch to each element type for lowering. If we don't have support for
14172 // specific element type shuffles at 512 bits, immediately split them and
14173 // lower them. Each lowering routine of a given type is allowed to assume that
14174 // the requisite ISA extensions for that element type are available.
14175 switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64i8:
    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }
}
14194 // Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle and then truncate it back.
14198 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14199 MVT VT, SDValue V1, SDValue V2,
14200 const X86Subtarget &Subtarget,
14201 SelectionDAG &DAG) {
14202 assert(Subtarget.hasAVX512() &&
14203 "Cannot lower 512-bit vectors w/o basic ISA!");
  MVT ExtVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements");
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
    break;
  case MVT::v16i1:
    ExtVT = MVT::v16i32;
    break;
  case MVT::v32i1:
    ExtVT = MVT::v32i16;
    break;
  case MVT::v64i1:
    ExtVT = MVT::v64i8;
    break;
  }
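  // E.g. a v16i1 shuffle is carried out as a v16i32 shuffle: sign-extend,
  // shuffle, then truncate (or convert via CVT2MASK) back to v16i1.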
14228 if (ISD::isBuildVectorAllZeros(V1.getNode()))
14229 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14230 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
14231 V1 = getOnesVector(ExtVT, DAG, DL);
  else
    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);

  if (V2.isUndef())
    V2 = DAG.getUNDEF(ExtVT);
14237 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
14238 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14239 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
14240 V2 = getOnesVector(ExtVT, DAG, DL);
  else
    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14244 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // Since i1 was sign-extended we can use X86ISD::CVT2MASK.
14246 int NumElems = VT.getVectorNumElements();
14247 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14248 (Subtarget.hasDQI() && (NumElems < 32)))
14249 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
14254 /// Helper function that returns true if the shuffle mask should be
14255 /// commuted to improve canonicalization.
14256 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14257 int NumElements = Mask.size();
  int NumV1Elements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      continue;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;
14268 // Commute the shuffle as needed such that more elements come from V1 than
14269 // V2. This allows us to match the shuffle pattern strictly on how many
14270 // elements come from V1 without handling the symmetric cases.
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  if (NumV2Elements == 0)
    return false;
  // When the number of V1 and V2 elements is the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum
  // of indices for V2. When those are equal, try to ensure that the number of
  // odd indices for V1 is lower than the number of odd indices for V2.
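  // E.g. for the v4 mask {4, 5, 2, 3} both inputs supply two elements, but
  // the low half uses only V2 (LowV2Elements = 2 > LowV1Elements = 0), so
  // the shuffle is commuted.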
14284 if (NumV1Elements == NumV2Elements) {
14285 int LowV1Elements = 0, LowV2Elements = 0;
14286 for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
14293 if (LowV2Elements == LowV1Elements) {
14294 int SumV1Indices = 0, SumV2Indices = 0;
14295 for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
14302 if (SumV2Indices == SumV1Indices) {
14303 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14304 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14305 if (Mask[i] >= NumElements)
14306 NumV2OddIndices += i % 2;
14307 else if (Mask[i] >= 0)
14308 NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
}
14318 /// \brief Top-level lowering for x86 vector shuffles.
14320 /// This handles decomposition, canonicalization, and lowering of all x86
14321 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14322 /// above in helper routines. The canonicalization attempts to widen shuffles
14323 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
14324 /// s.t. only one of the two inputs needs to be tested, etc.
14325 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14326 SelectionDAG &DAG) {
14327 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14328 ArrayRef<int> Mask = SVOp->getMask();
14329 SDValue V1 = Op.getOperand(0);
14330 SDValue V2 = Op.getOperand(1);
14331 MVT VT = Op.getSimpleValueType();
14332 int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14336 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14337 "Can't lower MMX shuffles");
14339 bool V1IsUndef = V1.isUndef();
14340 bool V2IsUndef = V2.isUndef();
14341 if (V1IsUndef && V2IsUndef)
14342 return DAG.getUNDEF(VT);
  // When we create a shuffle node we put the UNDEF node as the second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);
14350 // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef)
    for (int M : Mask)
      if (M >= NumElements) {
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
        for (int &M : NewMask)
          if (M >= NumElements)
            M = -1;
        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
      }
14363 // Check for illegal shuffle mask element index values.
14364 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
14365 assert(llvm::all_of(Mask,
14366 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
14367 "Out of bounds shuffle index");
14369 // We actually see shuffles that are entirely re-arrangements of a set of
14370 // zero inputs. This mostly happens while decomposing complex shuffles into
14371 // simple ones. Directly lower these as a buildvector of zeros.
14372 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
14373 if (Zeroable.isAllOnesValue())
14374 return getZeroVector(VT, Subtarget, DAG, DL);
14376 // Try to collapse shuffles into using a vector type with fewer elements but
14377 // wider element types. We cap this to not form integers or floating point
14378 // elements wider than 64 bits, but it might be interesting to form i128
14379 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
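  // E.g. a v4i32 shuffle with mask {0, 1, 4, 5} can be widened to a v2i64
  // shuffle with mask {0, 2}.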
14380 SmallVector<int, 16> WidenedMask;
14381 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
14382 canWidenShuffleElements(Mask, WidenedMask)) {
14383 MVT NewEltVT = VT.isFloatingPoint()
14384 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
14385 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
14386 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
14389 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14390 V1 = DAG.getBitcast(NewVT, V1);
14391 V2 = DAG.getBitcast(NewVT, V2);
14392 return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }
14397 // Commute the shuffle if it will improve canonicalization.
14398 if (canonicalizeShuffleMaskWithCommute(Mask))
14399 return DAG.getCommutedVectorShuffle(*SVOp);
14401 // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is256BitVector())
    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is512BitVector())
    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (Is1BitVector)
    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}
14420 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
14421 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14422 const X86Subtarget &Subtarget,
14423 SelectionDAG &DAG) {
14424 SDValue Cond = Op.getOperand(0);
14425 SDValue LHS = Op.getOperand(1);
14426 SDValue RHS = Op.getOperand(2);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();
14432 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14434 // Only non-legal VSELECTs reach this lowering, convert those into generic
14435 // shuffles and re-use the shuffle lowering path for blends.
14436 SmallVector<int, 32> Mask;
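  // E.g. a v4i32 VSELECT with constant condition <-1, 0, -1, 0> becomes the
  // shuffle mask {0, 5, 2, 7}: true lanes pick from LHS, false lanes from RHS.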
14437 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14438 SDValue CondElt = CondBV->getOperand(i);
    Mask.push_back(
        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
                                     : -1);
  }
  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
14446 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14447 // A vselect where all conditions and data are constants can be optimized into
14448 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14449 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14450 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();
14454 // Try to lower this to a blend-style vector shuffle. This can handle all
14455 // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
    return Op;
14464 // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
14471 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // instructions.
14474 if (VT.getSizeInBits() == 512) {
14475 SDValue Cond = Op.getOperand(0);
    // The vNi1 condition case should be handled above as it can be trivially
    // lowered.
14478 assert(Cond.getValueType().getScalarSizeInBits() ==
14479 VT.getScalarSizeInBits() &&
14480 "Should have a size-matched integer condition!");
14481 // Build a mask by testing the condition against itself (tests for zero).
14482 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14483 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14484 // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
  }
  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return a null
  // value.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16:
    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
    return SDValue();
  }
}
14511 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();
14518 if (VT.getSizeInBits() == 8) {
14519 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14520 Op.getOperand(0), Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }
14524 if (VT == MVT::f32) {
14525 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14526 // the result back to FR32 register. It's only worth matching if the
14527 // result has a single use which is a store or a bitcast to i32. And in
14528 // the case of a store, it's not worth it if the index is a constant 0,
14529 // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
14532 SDNode *User = *Op.getNode()->use_begin();
14533 if ((User->getOpcode() != ISD::STORE ||
14534 isNullConstant(Op.getOperand(1))) &&
14535 (User->getOpcode() != ISD::BITCAST ||
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
14538 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }
14544 if (VT == MVT::i32 || VT == MVT::i64) {
    // ExtractPS/pextrq work with a constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }

  return SDValue();
}
14553 /// Extract one bit from mask vector, like v16i1 or v8i1.
14554 /// AVX-512 feature.
14555 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
14556 const X86Subtarget &Subtarget) {
14557 SDValue Vec = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VecVT = Vec.getSimpleValueType();
14560 SDValue Idx = Op.getOperand(1);
14561 MVT EltVT = Op.getSimpleValueType();
14563 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14564 "Unexpected vector type in ExtractBitFromMaskVector");
  // A variable index can't be handled in mask registers, so
  // extend the vector to VR512/VR128.
14568 if (!isa<ConstantSDNode>(Idx)) {
14569 unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
    // than extending to 128/256-bit.
14572 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14573 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts);
14574 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14575 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14576 ExtVT.getVectorElementType(), Ext, Idx);
14577 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14580 // Canonicalize result type to MVT::i32.
14581 if (EltVT != MVT::i32) {
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  Vec, Idx);
    return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
  }
14587 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  // Extracts from element 0 are always allowed.
  if (IdxVal == 0)
    return Op;
14593 // If the kshift instructions of the correct width aren't natively supported
14594 // then we need to promote the vector to the native size to get the correct
14595 // zeroing behavior.
14596 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14597 (VecVT.getVectorNumElements() < 8)) {
14598 VecVT = MVT::v16i1;
14599 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14600 DAG.getUNDEF(VecVT),
                      Vec,
                      DAG.getIntPtrConstant(0, dl));
  }
14605 // Use kshiftr instruction to move to the lower element.
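  // E.g. extracting bit 5 of a v16i1 mask becomes a KSHIFTR by 5 followed by
  // an extract of element 0.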
14606 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14607 DAG.getConstant(IdxVal, dl, MVT::i8));
14608 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
                     DAG.getIntPtrConstant(0, dl));
}
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14614 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
14617 MVT VecVT = Vec.getSimpleValueType();
14618 SDValue Idx = Op.getOperand(1);
14620 if (VecVT.getVectorElementType() == MVT::i1)
14621 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
14623 if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
14626 // IACA tool was used to get performance estimation
14627 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14629 // example : extractelement <16 x i8> %a, i32 %i
14631 // Block Throughput: 3.00 Cycles
14632 // Throughput Bottleneck: Port5
14634 // | Num Of | Ports pressure in cycles | |
14635 // | Uops | 0 - DV | 5 | 6 | 7 | |
14636 // ---------------------------------------------
14637 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14638 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14639 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14640 // Total Num Of Uops: 4
14643 // Block Throughput: 1.00 Cycles
14644 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14646 // | | Ports pressure in cycles | |
14647 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14648 // ---------------------------------------------------------
14649 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14650 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14651 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4
    return SDValue();
  }
14657 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14659 // If this is a 256-bit vector result, first extract the 128-bit vector and
14660 // then extract the element from the 128-bit vector.
14661 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14662 // Get the 128-bit vector.
14663 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14664 MVT EltVT = VecVT.getVectorElementType();
14666 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14667 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14669 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14670 // this can be done with a mask.
14671 IdxVal &= ElemsPerChunk - 1;
14672 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }
14676 assert(VecVT.is128BitVector() && "Unexpected vector length");
14678 MVT VT = Op.getSimpleValueType();
14680 if (VT.getSizeInBits() == 16) {
14681 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14682 // we're going to zero extend the register or fold the store (SSE41 only).
14683 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14684 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14685 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14686 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14687 DAG.getBitcast(MVT::v4i32, Vec), Idx));
    // Transform it so it matches pextrw, which produces a 32-bit result.
14690 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14691 Op.getOperand(0), Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }
14695 if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;
14699 // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling
  // to the stack.
14702 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14703 // Extract either the lowest i32 or any i16, and extract the sub-byte.
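    // E.g. for IdxVal == 6 we extract word 3 (bytes 6 and 7); the byte is
    // already in the low 8 bits (ShiftVal == 0). For IdxVal == 7 the word is
    // first shifted right by 8.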
14704 int DWordIdx = IdxVal / 4;
14705 if (DWordIdx == 0) {
14706 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14707 DAG.getBitcast(MVT::v4i32, Vec),
14708 DAG.getIntPtrConstant(DWordIdx, dl));
14709 int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }
14716 int WordIdx = IdxVal / 2;
14717 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14718 DAG.getBitcast(MVT::v8i16, Vec),
14719 DAG.getIntPtrConstant(WordIdx, dl));
14720 int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }
  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

14731 // SHUFPS the element to the lowest double word, then movss.
14732 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14733 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14734 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }
14738 if (VT.getSizeInBits() == 64) {
14739 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14740 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

14745 // UNPCKHPD the element to the lowest double word, then movsd.
14746 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14747 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14748 int Mask[2] = { 1, -1 };
14749 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14750 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}
14757 /// Insert one bit to mask vector, like v16i1 or v8i1.
14758 /// AVX-512 feature.
14759 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
14760 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
14763 SDValue Elt = Op.getOperand(1);
14764 SDValue Idx = Op.getOperand(2);
14765 MVT VecVT = Vec.getSimpleValueType();
14767 if (!isa<ConstantSDNode>(Idx)) {
14768 // Non constant index. Extend source and destination,
14769 // insert element and then truncate the result.
14770 unsigned NumElts = VecVT.getVectorNumElements();
14771 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14772 MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14773 MVT ExtEltVT = ExtVecVT.getVectorElementType();
14774 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14775 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14776 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14777 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14780 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14781 unsigned NumElems = VecVT.getVectorNumElements();
14783 // If the kshift instructions of the correct width aren't natively supported
14784 // then we need to promote the vector to the native size to get the correct
14785 // zeroing behavior.
14786 if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
14787 // Need to promote to v16i1, do the insert, then extract back.
14788 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
14789 DAG.getUNDEF(MVT::v16i1), Vec,
14790 DAG.getIntPtrConstant(0, dl));
14791 Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
14792 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
14793 DAG.getIntPtrConstant(0, dl));
14796 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14798 if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }
14805 // Insertion of one bit into first position
  if (IdxVal == 0) {
14807 // Clean top bits of vector.
14808 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14809 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14810 EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14811 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14812 // Clean the first bit in source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
14820 // Insertion of one bit into last position
14821 if (IdxVal == NumElems - 1) {
14822 // Move the bit to the last position inside the vector.
14823 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14824 DAG.getConstant(IdxVal, dl, MVT::i8));
14825 // Clean the last bit in the source vector.
14826 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14827 DAG.getConstant(1, dl, MVT::i8));
14828 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
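  // General case: isolate (old_bit ^ new_bit) at bit 0, move it to position
  // IdxVal while zeroing everything else, then xor with the original vector
  // so that only bit IdxVal can change. E.g. for IdxVal == 3 in a v8i1 the
  // shift amounts below are 3, NumElems - 1 = 7 and NumElems - 1 - IdxVal = 4.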
  // Move the current value of the bit to be replaced to bit 0.
14835 SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14836 DAG.getConstant(IdxVal, dl, MVT::i8));
14837 // Xor with the new bit.
14838 Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
14839 // Shift to MSB, filling bottom bits with 0.
14840 Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
14841 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14842 // Shift to the final position, filling upper bits with 0.
14843 Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
14844 DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
  // Xor with original vector to cancel out the original bit value that's still
  // present.
  return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
}
14850 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14851 SelectionDAG &DAG) const {
14852 MVT VT = Op.getSimpleValueType();
14853 MVT EltVT = VT.getVectorElementType();
14854 unsigned NumElts = VT.getVectorNumElements();
14856 if (EltVT == MVT::i1)
14857 return InsertBitToMaskVector(Op, DAG, Subtarget);
  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
14861 SDValue N1 = Op.getOperand(1);
14862 SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
14865 auto *N2C = cast<ConstantSDNode>(N2);
14866 unsigned IdxVal = N2C->getZExtValue();
14868 bool IsZeroElt = X86::isZeroNode(N1);
14869 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle of a rematerializable vector than a costly integer
  // insertion.
14874 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14875 16 <= EltVT.getSizeInBits()) {
14876 SmallVector<int, 8> BlendMask;
14877 for (unsigned i = 0; i != NumElts; ++i)
14878 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
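    // E.g. inserting a zero into element 2 of a v4i32 gives BlendMask
    // {0, 1, 6, 3}, where index 6 selects element 2 of the zero vector.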
14879 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14880 : getOnesVector(VT, DAG, dl);
    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
  }
14884 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14885 // into that, and then insert the subvector back into the result.
14886 if (VT.is256BitVector() || VT.is512BitVector()) {
14887 // With a 256-bit vector, we can insert into the zero element efficiently
14888 // using a blend if we have AVX or AVX2 and the right data type.
14889 if (VT.is256BitVector() && IdxVal == 0) {
14890 // TODO: It is worthwhile to cast integer to floating point and back
14891 // and incur a domain crossing penalty if that's what we'll end up
14892 // doing anyway after extracting to a 128-bit vector.
14893 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14894 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14895 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14896 N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }
14901 // Get the desired 128-bit vector chunk.
14902 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14904 // Insert the element into the desired chunk.
14905 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14906 assert(isPowerOf2_32(NumEltsIn128));
14907 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14908 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14910 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14911 DAG.getConstant(IdxIn128, dl, MVT::i32));
14913 // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
14916 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
14919 // argument. SSE41 required for pinsrb.
14920 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
14923 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14924 Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }
14931 if (N1.getValueType() != MVT::i32)
14932 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14933 if (N2.getValueType() != MVT::i32)
14934 N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }
14938 if (Subtarget.hasSSE41()) {
14939 if (EltVT == MVT::f32) {
14940 // Bits [7:6] of the constant are the source select. This will always be
14941 // zero here. The DAG Combiner may combine an extract_elt index into
14942 // these bits. For example (insert (extract, 3), 2) could be matched by
14943 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14944 // Bits [5:4] of the constant are the destination select. This is the
14945 // value of the incoming immediate.
14946 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14947 // combine either bitwise AND or insert of float 0.0 to set these bits.
14949 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
14950 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14951 // If this is an insertion of 32-bits into the low 32-bits of
14952 // a vector, we prefer to generate a blend with immediate rather
14953 // than an insertps. Blends are simpler operations in hardware and so
14954 // will always have equal or better performance than insertps.
14955 // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
14962 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }
14968 // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}
14976 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14977 SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();
  // It's always cheaper to replace an xor+movd with xorps, and it simplifies
  // further combining.
14983 if (X86::isZeroNode(Op.getOperand(0)))
14984 return getZeroVector(OpVT, Subtarget, DAG, dl);
14986 // If this is a 256-bit vector result, first insert into a 128-bit
14987 // vector and then insert into the 256-bit vector.
14988 if (!OpVT.is128BitVector()) {
14989 // Insert into a 128-bit vector.
14990 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14991 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14992 OpVT.getVectorNumElements() / SizeFactor);
14994 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14996 // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
14999 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15001 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;
15005 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15006 return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
15010 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15011 // simple superregister reference or explicit instructions to insert
15012 // the upper bits of a vector.
15013 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15014 SelectionDAG &DAG) {
15015 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
  return insert1BitVector(Op, DAG, Subtarget);
}
15020 // Returns the appropriate wrapper opcode for a global reference.
15021 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
15022 // References to absolute symbols are never PC-relative.
15023 if (GV && GV->isAbsoluteSymbolRef())
15024 return X86ISD::Wrapper;
15026 CodeModel::Model M = getTargetMachine().getCodeModel();
15027 if (Subtarget.isPICStyleRIPRel() &&
15028 (M == CodeModel::Small || M == CodeModel::Kernel))
15029 return X86ISD::WrapperRIP;
  return X86ISD::Wrapper;
}
15034 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15035 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15036 // one of the above mentioned nodes. It has to be wrapped because otherwise
15037 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
15041 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15042 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15044 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15045 // global base reg.
15046 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15048 auto PtrVT = getPointerTy(DAG.getDataLayout());
15049 SDValue Result = DAG.getTargetConstantPool(
15050 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}
15063 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15064 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15066 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15067 // global base reg.
15068 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15070 auto PtrVT = getPointerTy(DAG.getDataLayout());
15071 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}

SDValue
15085 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15086 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15088 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15089 // global base reg.
15090 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15091 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15093 auto PtrVT = getPointerTy(DAG.getDataLayout());
15094 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15099 // With PIC, the address is actually $g + Offset.
15100 if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
15108 if (isGlobalStubReference(OpFlag))
15109 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}

SDValue
15116 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15117 // Create the TargetBlockAddressAddress node.
15118 unsigned char OpFlags =
15119 Subtarget.classifyBlockAddressReference();
15120 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15121 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
15124 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15125 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15127 // With PIC, the address is actually $g + Offset.
15128 if (isGlobalRelativeToPICBase(OpFlags)) {
15129 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}
15136 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15137 const SDLoc &dl, int64_t Offset,
15138 SelectionDAG &DAG) const {
15139 // Create the TargetGlobalAddress node, folding in the constant
15140 // offset if it is legal.
15141 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15142 CodeModel::Model M = DAG.getTarget().getCodeModel();
15143 auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
15146 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15147 // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  }
15154 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
15156 // With PIC, the address is actually $g + Offset.
15157 if (isGlobalRelativeToPICBase(OpFlags)) {
15158 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
15164 if (isGlobalStubReference(OpFlags))
15165 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15166 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15168 // If there was a non-zero offset that we didn't fold, create an explicit
15169 // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}

SDValue
15178 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15179 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15180 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

static SDValue
15185 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15186 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15187 unsigned char OperandFlags, bool LocalDynamic = false) {
15188 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15189 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
15200 SDValue Ops[] = { Chain, TGA, *InFlag };
15201 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }
15207 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
15208 MFI.setAdjustsStack(true);
15209 MFI.setHasCalls(true);
15211 SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}
15215 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
15220 SDLoc dl(GA); // ? function entry point might be better
15221 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15222 DAG.getNode(X86ISD::GlobalBaseReg,
15223 SDLoc(), PtrVT), InFlag);
15224 InFlag = Chain.getValue(1);
  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
15229 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
15233 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG, const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);
15243 // Get the start address of the TLS block for this module.
15244 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15245 .getInfo<X86MachineFunctionInfo>();
15246 MFI->incNumLocalDynamicTLSAccesses();
  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
15254 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15255 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15256 InFlag = Chain.getValue(1);
15257 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
15265 unsigned char OperandFlags = X86II::MO_DTPOFF;
15266 unsigned WrapperKind = X86ISD::Wrapper;
15267 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15268 GA->getValueType(0),
15269 GA->getOffset(), OperandFlags);
15270 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15272 // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}
15276 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15277 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15278 const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);
15282 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15283 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15284 is64Bit ? 257 : 256));
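  // Note: address space 256 denotes %gs-relative and 257 %fs-relative
  // addressing in the X86 backend, so this becomes a segment-prefixed load
  // from offset 0.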
15286 SDValue ThreadPointer =
15287 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15288 MachinePointerInfo(Ptr));
15290 unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initial exec.
15293 unsigned WrapperKind = X86ISD::Wrapper;
15294 if (model == TLSModel::LocalExec) {
15295 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15296 } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }
15307 // emit "addl x@ntpoff,%eax" (local exec)
15308 // or "addl x@indntpoff,%eax" (initial exec)
15309 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
15312 GA->getOffset(), OperandFlags);
15313 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15315 if (model == TLSModel::InitialExec) {
15316 if (isPIC && !is64Bit) {
15317 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

15322 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }
  // The address of the thread local variable is the sum of the thread
  // pointer and the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
15332 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
15334 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
15336 if (DAG.getTarget().Options.EmulatedTLS)
15337 return LowerToTLSEmulatedModel(GA, DAG);
15339 const GlobalValue *GV = GA->getGlobal();
15340 auto PtrVT = getPointerTy(DAG.getDataLayout());
15341 bool PositionIndependent = isPositionIndependent();
15343 if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
15346 case TLSModel::GeneralDynamic:
15347 if (Subtarget.is64Bit())
15348 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
15349 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
15350 case TLSModel::LocalDynamic:
15351 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
15352 Subtarget.is64Bit());
15353 case TLSModel::InitialExec:
15354 case TLSModel::LocalExec:
15355 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
15356 PositionIndependent);
15358 llvm_unreachable("Unknown TLS model.");
15361 if (Subtarget.isTargetDarwin()) {
15362 // Darwin only has one model of TLS. Lower to that.
15363 unsigned char OpFlag = 0;
15364 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
15365 X86ISD::WrapperRIP : X86ISD::Wrapper;
15367 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15368 // global base reg.
15369 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
15375 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
15376 GA->getValueType(0),
15377 GA->getOffset(), OpFlag);
15378 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
15380 // With PIC32, the address is actually $g + Offset.
15382 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
15383 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15386 // Lowering the machine isd will make sure everything is in the right
15388 SDValue Chain = DAG.getEntryNode();
15389 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15390 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
15391 SDValue Args[] = { Chain, Offset };
15392 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
15393 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
15394 DAG.getIntPtrConstant(0, DL, true),
15395 Chain.getValue(1), DL);
15397 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
15398 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15399 MFI.setAdjustsStack(true);
15401 // And our return value (tls address) is in the standard call return value
15403 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15404 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
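  // For intuition, the Darwin TLVP scheme typically materializes as an
  // indirect call through the variable's thread-local descriptor, e.g.:
  //   movq _x@TLVP(%rip), %rdi
  //   callq *(%rdi)          ; returns the variable's address in %rax
  // (Illustrative sketch only.)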
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]  ; Load pointer to ThreadLocalStorage
    //                                ; from TEB
    //   mov ecx, dword [rel _tls_index]  ; Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    //   Windows 64bit: gs:0x58
    //   Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable.
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of the start of the .tls section (section-relative).
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread-local variable is the thread pointer plus the
    // offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }
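  // Roughly, in C-like pseudocode the Windows sequence computes:
  //   char **slots = *(char ***)(TEB + 0x58);  // ThreadLocalStoragePointer
  //   char *base   = slots[_tls_index];        // this module's TLS block
  //   return base + var@SECREL;                // section-relative offset
  // (Illustrative sketch; 32-bit uses fs:__tls_array instead of gs:0x58.)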
  llvm_unreachable("TLS not implemented for this target.");
}
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getSimpleValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe; it's optimized away
  // during isel.
  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
                       : DAG.getConstant(0, dl, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }

  // If the shift amount is larger than or equal to the width of a part, we
  // can't rely on the results of shld/shrd. Insert a test and select the
  // appropriate values for large shift amounts.
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, dl, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, dl, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
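// Worked example (informal): a 64-bit logical right shift on a 32-bit target
// with amount Amt < 32 computes
//   Lo' = (Lo >> Amt) | (Hi << (32 - Amt))   // a single SHRD instruction
//   Hi' = Hi >> Amt
// and when bit 5 of the amount is set (Amt >= 32), the CMOVs above instead
// select Lo' = Hi >> (Amt & 31) and Hi' = 0 (or the sign fill for SRA_PARTS).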
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                     DAG.getUNDEF(SrcVT)));
    }
    if (SrcVT.getVectorElementType() == MVT::i1) {
      if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
        return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                           DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
    }
    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }

  SDValue ValueToStore = Op.getOperand(0);
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Chain = DAG.getStore(
      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // Build the FILD.
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                           X86ISD::FILD, DL,
                                           Tys, Ops, SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueSizeInBits()/8;
    int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
    auto PtrVT = getPointerTy(MF.getDataLayout());
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, Op.getValueType(), MMO);
    Result = DAG.getLoad(
        Op.getValueType(), DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  }

  return Result;
}
/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */

  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

  SDValue CLod1 =
      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  SDValue Result;
  if (Subtarget.hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0, dl));
}
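// Why this works (informal): splitting the u64 as x = Hi * 2^32 + Lo, the
// punpckldq forms two doubles by bit pattern:
//   d0 = 2^52 + Lo        (exponent 0x433, Lo in the low mantissa bits)
//   d1 = 2^84 + Hi * 2^32 (exponent 0x453, Hi in the low mantissa bits)
// Subtracting c1 = { 2^52, 2^84 } recovers { Lo, Hi * 2^32 } exactly, and
// the horizontal add produces Lo + Hi * 2^32 = x, with a single rounding in
// the final addition.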
/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getBitcast(MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0, dl));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(
      ISD::OR, dl, MVT::v2i64,
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

  // Subtract the bias.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getSimpleValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0, dl));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // The destination is already f64; no rounding needed.
  return Sub;
}
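// Same bit trick in scalar form (informal): OR'ing the 32-bit value into the
// mantissa of 2^52 (bit pattern 0x43300000'00000000) yields the double
// 2^52 + x exactly, so subtracting the 2^52 bias leaves x with no rounding.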
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  SDValue N0 = Op.getOperand(0);
  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  // Legalize to v4i32 type.
  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                   DAG.getUNDEF(MVT::v2i32));

  if (Subtarget.hasAVX512())
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);

  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // Two to the power of half-word-size.
  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

  // Clear the upper part of LO, and the lower part of HI.
  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);

  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}
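// For intuition: each u32 is split as u = HI * 2^16 + LO with HI, LO < 2^16,
// so both halves are non-negative i32 values that the *signed* CVTSI2P can
// convert exactly; f64 then represents HI * 2^16 + LO without rounding.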
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //   uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //   uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                               (uint4) 0x53000000, 0xaa);
  // #else
  //   uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //   uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //   float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //   return (float4) lo + fhi;
  //
  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  MVT VecIntVT = V.getSimpleValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                             (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  // return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
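// Why this works (informal): as float bit patterns, 0x4b000000 is 2^23 and
// 0x53000000 is 2^39. Placing the low half-word in the mantissa of 2^23
// gives exactly 2^23 + lo, and the high half-word in 2^39 gives
// 2^39 + hi * 2^16. Adding -(2^39 + 2^23) cancels both biases, leaving
// hi * 2^16 + lo = v with a single rounding in the final FADD.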
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
    if (SrcVT == MVT::v2i1)
      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
  }

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i32:
  case MVT::v8i32:
    assert(!Subtarget.hasAVX512());
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  }
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (Op.getSimpleValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG);

  MVT SrcVT = N0.getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal,
    // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
    return Op;
  }

  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo());
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
                                  OffsetSlot, MachinePointerInfo());
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue ValueToStore = Op.getOperand(0);
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
                               MachinePointerInfo());
  // For an i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
      MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0, dl);
  SDValue Four = DAG.getIntPtrConstant(4, dl);
  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
      /* Alignment = */ 4);
  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                     DAG.getIntPtrConstant(0, dl));
}
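// For intuition: FILD interprets the stored u64 as *signed*, so a value with
// the top bit set comes back 2^64 too small. 0x5F800000 is 2^64 as an f32 bit
// pattern; conditionally adding it (or 0.0) in x87 extended precision
// corrects the result before the final rounding to the destination type.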
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an <SDValue(), SDValue()> pair.
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence.
// If lowered to the final integer result we return a <result, SDValue()> pair.
// Otherwise we lower it to a sequence ending with a FIST, return a
// <FIST, StackSlot> pair, and the caller is responsible for loading
// the final integer result from StackSlot.
std::pair<SDValue,SDValue>
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, bool IsReplace) const {
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  EVT TheVT = Op.getOperand(0).getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return std::make_pair(SDValue(), SDValue());
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64. A FIST is always
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned &&
                       DstTy == MVT::i64 &&
                       (!Subtarget.is64Bit() ||
                        !isScalarFPTypeInSSEReg(TheVT));

  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the fist result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget.is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  unsigned Opc;
  switch (DstTy.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64. Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
    // FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
    // Fist-to-mem64 FistSrc
    // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
    // to XOR'ing the high 32 bits with Adjust.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    SDValue Cmp = DAG.getSetCC(DL,
                               getSetCCResultType(DAG.getDataLayout(),
                                                  *DAG.getContext(), TheVT),
                               Value, ThreshVal, ISD::SETLT);
    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
                           DAG.getConstant(0, DL, MVT::i32),
                           DAG.getConstant(0x80000000, DL, MVT::i32));
    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
    Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
                                              *DAG.getContext(), TheVT),
                       Value, ThreshVal, ISD::SETLT);
    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
  }
  // FIXME: This causes a redundant load/store if the SSE-class value is
  // already in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(MF, SSFI));
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                                MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  }

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, MemSize, MemSize);

  if (UnsignedFixup) {

    // Insert the FIST, load its result as two i32's,
    // and XOR the high i32 with Adjust.

    SDValue FistOps[] = { Chain, Value, StackSlot };
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                           FistOps, DstTy, MMO);

    SDValue Low32 =
        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

    SDValue High32 =
        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

    if (Subtarget.is64Bit()) {
      // Join High32 and Low32 into a 64-bit result.
      // (High32 << 32) | Low32
      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
                           DAG.getConstant(32, DL, MVT::i8));
      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
      return std::make_pair(Result, SDValue());
    }

    SDValue ResultOps[] = { Low32, High32 };

    SDValue pair = IsReplace
      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
      : DAG.getMergeValues(ResultOps, DL);
    return std::make_pair(pair, SDValue());
  }

  // Build the FP_TO_INT*_IN_MEM.
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);
  return std::make_pair(FIST, StackSlot);
}
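// Worked example (informal): converting 2^63 + 42.0 to u64. The value is not
// below Thresh = 2^63, so FIST stores FistSrc = (2^63 + 42) - 2^63 = 42, and
// Adjust = 0x80000000. XOR'ing the high half of the i64 result with Adjust
// sets bit 63, reconstructing 2^63 + 42 as an unsigned value.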
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
      (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
      (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
      (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
      (VT != MVT::v32i16 || InVT != MVT::v32i8))
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);

  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                             VT.getVectorNumElements()/2);

  OpLo = DAG.getBitcast(HVT, OpLo);
  OpHi = DAG.getBitcast(HVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  SDLoc DL(Op);
  unsigned NumElts = VT.getVectorNumElements();

  // Extend VT if the scalar type is i8/i16 and BWI is not supported.
  MVT ExtVT = VT;
  if (!Subtarget.hasBWI() &&
      (VT.getVectorElementType().getSizeInBits() <= 16))
    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);

  // Widen to 512-bits if VLX is not supported.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    NumElts *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, NumElts);
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
                     In, DAG.getIntPtrConstant(0, DL));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
                              NumElts);
  }

  SDValue One = DAG.getConstant(1, DL, WideVT);
  SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);

  SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);

  // Truncate if we had to extend i16/i8 above.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    SelectedVal = DAG.getNode(X86ISD::VTRUNC, DL, WideVT, SelectedVal);
  }

  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
    SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
                              DAG.getIntPtrConstant(0, DL));

  return SelectedVal;
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();

  if (InVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  return SDValue();
}
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  if (SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
         Op.getSimpleValueType().getVectorNumElements() !=
             SVT.getVectorNumElements());
  return SDValue();
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
                                      const SDLoc &DL, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
         "Unexpected PACK opcode");

  // Requires SSE2, but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required; we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  // We only support vector truncation to 128 bits or greater from a
  // 256-bit or greater source.
  unsigned DstSizeInBits = DstVT.getSizeInBits();
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
    return SDValue();

  LLVMContext &Ctx = *DAG.getContext();
  unsigned NumElems = SrcVT.getVectorNumElements();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

  // Extract lower/upper subvectors.
  unsigned NumSubElts = NumElems / 2;
  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  EVT InVT = MVT::i16, OutVT = MVT::i8;
  if (DstVT.getScalarSizeInBits() > 8 &&
      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
    InVT = MVT::i32;
    OutVT = MVT::i16;
  }

  unsigned SubSizeInBits = SrcSizeInBits / 2;
  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

  // 256-bit -> 128-bit truncate - PACK lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512-bit -> 256-bit truncate - PACK lower/upper 256-bit subvectors.
  // AVX2: 512-bit -> 128-bit truncate - PACK(PACK, PACK).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    Res = DAG.getBitcast(MVT::v4i64, Res);
    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512-bit -> 128-bit, truncate another stage.
    EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
  Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

  PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
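// For intuition: a v16i16 -> v16i8 truncate becomes PACKUSWB(Lo, Hi) on the
// two v8i16 halves. PACKUS saturates each i16 to [0, 255], so the result only
// equals a plain truncate when the caller has proven the upper 8 bits of
// every element are zero (or copies of the sign bit, for PACKSS).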
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {

  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // Legal; will go to VPMOVB2M, VPMOVW2M.
      // Shifting packed bytes is not supported natively, so bitcast to words.
      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
      SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
                                      DAG.getBitcast(ExtVT, In),
                                      DAG.getConstant(ShiftInx, DL, ExtVT));
      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
    }
    // Use TESTD/Q, extending the vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
                                  DAG.getConstant(ShiftInx, DL, InVT));
  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned InNumEltBits = InVT.getScalarSizeInBits();

  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    // word to byte only under BWI
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In,
                                        DAG));
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
  }

  // Truncate with PACKSS if we are truncating a vector with sign-bits that
  // extend all the way to the packed/truncated value.
  unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
  if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
      return V;

  // Truncate with PACKUS if we are truncating a vector with leading zero bits
  // that extend all the way to the packed/truncated value.
  // Pre-SSE41 we can only use PACKUSWB.
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
  if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
      return V;

  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2, DL));
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }
  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      In = DAG.getBitcast(MVT::v32i8, In);

      // The PSHUFB mask:
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                      -1, -1, -1, -1, -1, -1, -1, -1,
                                      16, 17, 20, 21, 24, 25, 28, 29,
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);

      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0, DL));

    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4, DL));

    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

    // The PSHUFB mask:
    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};

    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

    // The MOVLHPS mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  }

  // Handle truncation of V256 to V128 using shuffles.
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();

  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

  unsigned NumElems = VT.getVectorNumElements();
  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare the truncation shuffle mask.
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  In = DAG.getBitcast(NVT, In);
  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) {
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = Op.getOperand(0);
    SDLoc dl(Op);
    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                     DAG.getUNDEF(MVT::v2f32)));
    }

    return SDValue();
  }

  assert(!VT.isVector());

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
                                                    IsSigned,
                                                    /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode())
    return Op;

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

  // The node is the result.
  return FIST;
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  bool IsFABS = (Op.getOpcode() == ISD::FABS);

  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  bool IsF128 = (VT == MVT::f128);

  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.

  MVT LogicVT;
  MVT EltVT;

  if (VT.isVector()) {
    LogicVT = VT;
    EltVT = VT.getVectorElementType();
  } else if (IsF128) {
    // SSE instructions are used for optimized f128 logical operations.
    LogicVT = MVT::f128;
    EltVT = VT;
  } else {
    // There are no scalar bitwise logical SSE/AVX instructions, so we
    // generate a 16-byte vector constant and logic op even for the scalar case.
    // Using a 16-byte mask allows folding the load of the mask with
    // the logic op, so it can save (~4 bytes) on code size.
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
    EltVT = VT;
  }

  unsigned EltBits = EltVT.getSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  APInt MaskElt =
    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

  SDValue Op0 = Op.getOperand(0);
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp =
    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

  if (VT.isVector() || IsF128)
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

  // For the scalar case extend to a 128-bit vector, perform the logic op,
  // and extract the scalar result back out.
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
}
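// For intuition, with f32 bit patterns (sketch):
//   fabs(x)  = x & 0x7fffffff   -> ANDPS with the sign-cleared mask
//   fneg(x)  = x ^ 0x80000000   -> XORPS with the sign-only mask
//   fnabs(x) = x | 0x80000000   -> ORPS, from folding FNEG(FABS(x))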
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);

  // If the sign operand is smaller, extend it first.
  MVT VT = Op.getSimpleValueType();
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

  // And if it is bigger, shrink it first.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
                       DAG.getIntPtrConstant(1, dl));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  MVT EltVT = VT.getScalarType();
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble()
                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

  // Perform all scalar logic operations as 16-byte vectors because there are
  // no scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // The mask constants are automatically splatted for vector types.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

  // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }

  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget.hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(Op);

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  SmallVector<SDValue, 8> VecIns;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if it's not an EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if the index is not a constant.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
      VecIns.push_back(ExtractedFromVec);
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
  }

  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief Return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
       ++UI) {
    SDNode *User = *UI;
    unsigned UOpNo = UI.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
      UOpNo = User->use_begin().getOperandNo();
      User = *User->use_begin();
    }

    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
      return true;
  }
  return false;
}
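
// An example of what EmitKTEST below catches: an all-zeros check of a v16i1
// mask usually arrives as a scalar compare of the bitcast mask,
//   (setcc (i16 (bitcast (v16i1 K))), 0, eq)
// and can test the mask register directly (subject to the DQI/BWI feature
// checks below) instead of first moving it into a GPR.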
// Emit KTEST instruction for bit vectors on AVX-512.
static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
                         const X86Subtarget &Subtarget) {
  if (Op.getOpcode() == ISD::BITCAST) {
    auto hasKTEST = [&](MVT VT) {
      unsigned SizeInBits = VT.getSizeInBits();
      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
             (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
    };
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = Op0.getValueType().getSimpleVT();
    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
        hasKTEST(Op0VT))
      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
  }
  return SDValue();
}
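
// The classic peephole implemented by EmitTest: a compare against zero does
// not need an immediate, e.g.
//   cmpl $0, %eax    becomes    testl %eax, %eax
// and when the operand is itself a flags-producing arithmetic node, even the
// TEST can be dropped by reusing that node's EFLAGS result.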
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                                    SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1) {
    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
                       DAG.getConstant(0, dl, MVT::i8));
  }
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      LLVM_FALLTHROUGH;
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit KTEST for bit vectors.
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
      default: break;
      case ISD::ADD:
      case ISD::SUB:
      case ISD::AND:
      case ISD::OR:
      case ISD::XOR:
        NeedTruncation = true;
        ArithOp = Arith;
        break;
      }
  }

  // Sometimes flags can be set either with an AND or with an SRL/SHL
  // instruction. The SRL/SHL variant should be preferred for masks longer
  // than this number of bits.
  const int ShiftToAndMaxMaskWidth = 32;
  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable, when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // We only want to rewrite this as a target-specific node with attached
    // flags if there is a reasonable chance of either using that to do custom
    // instructions selection that can fold some of the memory operands, or if
    // only the flags are used. If there are other uses, leave the node alone
    // and emit a test instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->isOne() &&
          (!Subtarget.slowIncDec() ||
           DAG.getMachineFunction().getFunction().optForSize())) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->isAllOnesValue() &&
          (!Subtarget.slowIncDec() ||
           DAG.getMachineFunction().getFunction().optForSize())) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;

  case ISD::SHL:
  case ISD::SRL:
    // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
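    // E.g. "(x >> 3) == 0" only needs the bits above bit 2, so it can become
    // "(x & 0xFFFFFFF8) == 0", which later selects to a single
    // "testl $-8, %reg".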
    if (ZeroCheck && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
        break;
      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                       DAG.getConstant(Mask, dl, VT));
    }
    break;

  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better. However, AND should be
    // preferred if the instruction can be combined into ANDN.
    if (!hasNonFlagsUse(Op)) {
      SDValue Op0 = ArithOp->getOperand(0);
      SDValue Op1 = ArithOp->getOperand(1);
      EVT VT = ArithOp.getValueType();
      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

      // If we cannot select an ANDN instruction, check if we can replace
      // AND+IMM64 with a shift before giving up. This is possible for masks
      // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
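      // E.g. "(x & 0xFFFF000000000000) == 0" becomes "shrq $48, %reg" and
      // "(x & 0x0000FFFFFFFFFFFF) == 0" becomes "shlq $16, %reg"; both set ZF
      // without materializing the wide mask in a register.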
      if (!isProperAndn) {
        if (!ZeroCheck)
          break;

        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
        auto *CN = dyn_cast<ConstantSDNode>(Op1);
        if (!CN)
          break;

        const APInt &Mask = CN->getAPIntValue();
        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
          break; // Prefer TEST instruction.

        unsigned BitWidth = Mask.getBitWidth();
        unsigned LeadingOnes = Mask.countLeadingOnes();
        unsigned TrailingZeros = Mask.countTrailingZeros();

        if (LeadingOnes + TrailingZeros == BitWidth) {
          assert(TrailingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
          break;
        }

        unsigned LeadingZeros = Mask.countLeadingZeros();
        unsigned TrailingOnes = Mask.countTrailingOnes();

        if (LeadingZeros + TrailingOnes == BitWidth) {
          assert(LeadingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
          break;
        }

        break;
      }
    }
    LLVM_FALLTHROUGH;
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Similar to ISD::ADD above, check if the uses will preclude useful
    // lowering of the target-specific node.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      if (!NeedTruncation && ZeroCheck) {
        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;

  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);

  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0) {
    // Emit KTEST for bit vectors.
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;

    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}
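
// A note on EmitCmp below: X86ISD::SUB sets EFLAGS exactly like CMP, so when
// the RHS is not the constant zero we emit a SUB and use its flag result; if
// the program also computes "a - b", the subtraction is CSE'd and the compare
// comes for free.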
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   const SDLoc &dl, SelectionDAG &DAG) const {
  if (isNullConstant(Op1))
    return EmitTest(Op0, X86CC, dl, DAG);

  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
         "Unexpected comparison operation for MVT::i1 operands");

  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    // Only promote the compare up to i32 if it is a 16-bit operation
    // with an immediate. 16-bit immediates are to be avoided.
    if ((Op0.getValueType() == MVT::i16 &&
         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
        !DAG.getMachineFunction().getFunction().optForMinSize() &&
        !Subtarget.isAtom()) {
      unsigned ExtendOp =
          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
    }
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
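
// On targets without FCOMI/FUCOMI (i.e. without CMOV), an x87 comparison
// leaves its result in FPSW rather than EFLAGS, so the helper below builds
// the DAG equivalent of the classic fixup sequence:
//   fucomp ; fnstsw %ax ; sahf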
/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget.hasCMov() ||
      Cmp.getOpcode() != X86ISD::CMP ||
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS.
  // Hence build an SDNode sequence that transfers the result from FPSW into
  // EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, dl, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);

  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // We never want to use both SQRT and RSQRT instructions for the same input.
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
    return false;

  if (VT.isVector())
    return Subtarget.hasFastVectorFSQRT();
  return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
                                           SelectionDAG &DAG, int Enabled,
                                           int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Op.getValueType();

  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
  // instructions: convert to single, rsqrtss, convert back to double, refine
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the
  // ISA along with FMA, this could be a throughput win.
  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
  // after legalize types.
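  // One Newton-Raphson step for rsqrt is est' = est * (1.5 - 0.5*x*est*est);
  // starting from the 2^-12 hardware estimate, a single step reaches the
  // ~24 bits a float needs, which is why RefinementSteps defaults to 1 below.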
  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    UseOneConstNR = false;
    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Op.getValueType();

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
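  // The corresponding Newton-Raphson step for the reciprocal is
  // est' = est * (2.0 - x*est), again taking the 2^-12 estimate to ~24 bits
  // in one refinement step.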
  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
    // Enable estimate codegen with 1 refinement step for vector division.
    // Scalar division estimates are disabled because they break too much
    // real-world code. These defaults are intended to match GCC behavior.
    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
      return SDValue();

    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a
/// multiplication. This is because we still need one division to calculate
/// the reciprocal and then we need two multiplies by that reciprocal as
/// replacements for the original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
  return 2;
}

/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
                        SelectionDAG &DAG) {
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
/// according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
                                   const SDLoc &dl, SelectionDAG &DAG) {
  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
  // instruction. Since the shift amount is in-range-or-undefined, we know
  // that doing a bittest on the i32 value is ok. We extend to i32 because
  // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reasons.
  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

  // See if we can use the 32-bit instruction instead of the 64-bit one for a
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
  // known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match. Since
  // BT ignores high bits (like shifts) we can use anyextend.
  if (Src.getValueType() != BitNo.getValueType())
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
  return getSETCC(Cond, BT, dl, DAG);
}
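
// Examples of the transform performed by LowerAndToBT below:
//   (and X, (shl 1, N)) == 0  -->  BT X, N  +  SETAE
//   (and (srl X, N), 1) != 0  -->  BT X, N  +  SETB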
/// Result of 'and' is compared against zero. Change to a BT node if possible.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG) {
  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known;
        DAG.computeKnownBits(Op0, Known);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      LHS = Op1;
      RHS = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
    }
  }

  if (LHS.getNode())
    return getBitTestCondition(LHS, RHS, CC, dl, DAG);

  return SDValue();
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CC.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                                   SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ: SSECC = 8; break;
  case ISD::SETONE: SSECC = 12; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}
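
// For example, "x ogt y" has no direct SSE predicate: the switch above swaps
// the operands and emits predicate 1 (LT), i.e. a cmpps with the arguments
// reversed. Predicates 8 (EQ_UQ) and 12 (NEQ_OQ) need AVX's extended range.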
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors.
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back.
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
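
// For example, a v32i8 compare on AVX1 (which lacks 256-bit integer
// compares) is split by the helper above into two v16i8 compares whose
// results are concatenated back into a v32i8 mask.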
static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected type for boolean compare operation");
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
                               DAG.getConstant(-1, dl, VT));
  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
                               DAG.getConstant(-1, dl, VT));
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETEQ:
    // (x == y) -> ~(x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT,
                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
                       DAG.getConstant(-1, dl, VT));
  case ISD::SETNE:
    // (x != y) -> (x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
  case ISD::SETUGT:
  case ISD::SETGT:
    // (x > y) -> (x & ~y)
    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
  case ISD::SETULT:
  case ISD::SETLT:
    // (x < y) -> (~x & y)
    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
  case ISD::SETULE:
  case ISD::SETLE:
    // (x <= y) -> (~x | y)
    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
  case ISD::SETUGE:
  case ISD::SETGE:
    // (x >= y) -> (x | ~y)
    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
  }
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  unsigned Opc = 0;
  bool Unsigned = false;
  bool Swap = false;
  unsigned SSECC = 0;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
  case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
  case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:  SSECC = 2; break;
  }

  if (Swap)
    std::swap(Op0, Op1);

  // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to
  // TESTM|NM.
  if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
    SDValue A = peekThroughBitcasts(Op0);
    if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
        ISD::isBuildVectorAllZeros(Op1.getNode())) {
      MVT VT0 = Op0.getSimpleValueType();
      SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
      SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
      return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
                         dl, VT, RHS, LHS);
    }
  }

  if (Opc)
    return DAG.getNode(Opc, dl, VT, Op0, Op1);
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, dl, MVT::i8));
}
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
                                      SelectionDAG &DAG) {
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
  if (!BV)
    return SDValue();

  MVT VT = Op1.getSimpleValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned n = VT.getVectorNumElements();
  SmallVector<SDValue, 8> ULTOp1;

  for (unsigned i = 0; i < n; ++i) {
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
      return SDValue();

    // Avoid underflow.
    APInt Val = Elt->getAPIntValue();
    if (Val == 0)
      return SDValue();

    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
  }

  return DAG.getBuildVector(VT, dl, ULTOp1);
}
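
// For instance, (x <u <10,10,10,10>) becomes (x <=u <9,9,9,9>), which the
// PSUBUS-based lowering further below can implement without swapping the
// operands.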
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned Opc;
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    } else {
      Opc = X86ISD::CMPP;
      // The SSE/AVX packed FP comparison nodes are defined with a
      // floating-point vector result that matches the operand type. This
      // allows them to work with an SSE1 target (integer vector types are
      // not legal).
      VT = Op0.getSimpleValueType();
    }

    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
    // emit two comparisons and a logic op to tie them together.
    SDValue Cmp;
    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
    if (SSECC >= 8 && !Subtarget.hasAVX()) {
      // LLVM predicate is SETUEQ or SETONE.
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (Cond == ISD::SETUEQ) {
        CC0 = 3; // UNORD
        CC1 = 0; // EQ
        CombineOpc = X86ISD::FOR;
      } else {
        assert(Cond == ISD::SETONE);
        CC0 = 7; // ORD
        CC1 = 4; // NEQ
        CombineOpc = X86ISD::FAND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, dl, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, dl, MVT::i8));
      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    } else {
      // Handle all other FP comparisons here.
      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
                        DAG.getConstant(SSECC, dl, MVT::i8));
    }

    // If this is SSE/AVX CMPP, bitcast the result back to integer to match
    // the result type of SETCC. The bitcast is expected to be optimized away
    // during combining/isel.
    if (Opc == X86ISD::CMPP)
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

    return Cmp;
  }
  MVT VTOp0 = Op0.getSimpleValueType();
  assert(VTOp0 == Op1.getSimpleValueType() &&
         "Expected operands with same type!");
  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
         "Invalid number of packed elements for source and destination!");

  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
    // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
    // legalizer firstly checks if the first operand in input to the setcc has
    // a legal type. If so, then it promotes the return type to that same
    // type. Otherwise, the return type is promoted to the 'next legal type'
    // which, for a vector of MVT::i1, is always a 128-bit integer vector
    // type.
    //
    // We reach this code only if the following two conditions are met:
    // 1. Both return type and operand type have been promoted to wider types
    //    by the type legalizer.
    // 2. The original operand type has been promoted to a 256-bit vector.
    //
    // Note that condition 2. only applies for AVX targets.
    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
    return DAG.getZExtOrTrunc(NewOp, dl, VT);
  }

  // The non-AVX512 code below works under the assumption that source and
  // destination types are the same.
  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
         "Value types for source and destination must be the same!");

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  // Operands are boolean (vectors of i1).
  MVT OpVT = Op1.getSimpleValueType();
  if (OpVT.getVectorElementType() == MVT::i1)
    return LowerBoolVSETCC_AVX512(Op, DAG);

  // The result is boolean, but operands are int/float.
  if (VT.getVectorElementType() == MVT::i1) {
    // In AVX-512 architecture setcc returns mask with i1 elements,
    // but there is no compare instruction for i8 and i16 elements in KNL.
    // In this case use SSE compare.
    bool UseAVX512Inst =
      (OpVT.is512BitVector() ||
       OpVT.getScalarSizeInBits() >= 32 ||
       (Subtarget.hasBWI() && Subtarget.hasVLX()));

    if (UseAVX512Inst)
      return LowerIntVSETCC_AVX512(Op, DAG);

    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // Lower using XOP integer comparisons.
  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
    // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (Cond) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    }

    // Are we comparing unsigned or signed integers?
    unsigned Opc =
        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(CmpMode, dl, MVT::i8));
  }

  // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
  // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
  if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
    SDValue BC0 = peekThroughBitcasts(Op0);
    if (BC0.getOpcode() == ISD::AND) {
      APInt UndefElts;
      SmallVector<APInt, 64> EltBits;
      if (getTargetConstantBitsFromNode(BC0.getOperand(1),
                                        VT.getScalarSizeInBits(), UndefElts,
                                        EltBits, false, false)) {
        if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
          Cond = ISD::SETEQ;
          Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
        }
      }
    }
  }
  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                            : X86ISD::PCMPGT;
  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
              Cond == ISD::SETGE || Cond == ISD::SETUGE;
  bool Invert = Cond == ISD::SETNE ||
                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

  // If both operands are known non-negative, then an unsigned compare is the
  // same as a signed compare and there's no need to flip signbits.
  // TODO: We could check for more general simplifications here since we're
  // computing known bits.
  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
                   !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

  // Special case: Use min/max operations for SETULE/SETUGE.
  MVT VET = VT.getVectorElementType();
  bool HasMinMax =
      (Subtarget.hasAVX512() && VET == MVT::i64) ||
      (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
      (Subtarget.hasSSE2() && (VET == MVT::i8));
  bool MinMax = false;
  if (HasMinMax) {
    switch (Cond) {
    default: break;
    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
    }
  }

  if (MinMax)
    Swap = Invert = FlipSigns = false;

  bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  bool Subus = false;
  if (!MinMax && HasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (Cond) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule. With psubus, setule does not require a swap. This is
      // beneficial because the constant in the register is no longer
      // clobbered as the destination, so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget.hasAVX())
        break;
      if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus)
      Opc = X86ISD::SUBUS;
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
      assert(Subtarget.hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the
      // sign bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)).
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64-bit
      // integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq
      // with pcmpeqd + pshufd + pand.
      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
  }
  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
      return NewSetCC;
  }

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
  // of these.
  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one
    // with the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
      if (!Invert)
        return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
    }
  }

  bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
  X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  return getSETCC(X86CC, EFLAGS, dl, DAG);
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());

  // Recreate the carry if needed.
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));

  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
  return getSETCC(CC, Cmp.getValue(1), DL, DAG);
}
/// Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
       Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue VOp0 = V.getOperand(0);
  unsigned InBits = VOp0.getValueSizeInBits();
  unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0,
                               APInt::getHighBitsSet(InBits, InBits - Bits));
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool AddTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE
  // ops are available or VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence
  // later.
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget.hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    unsigned SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (Subtarget.hasAVX512()) {
      SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
                                CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
      assert(!VT.isVector() && "Not a scalar type?");
      return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
    }

    if (SSECC < 8 || Subtarget.hasAVX()) {
      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, DL, MVT::i8));

      // If we have AVX, we can use a variable vector select (VBLENDV) instead
      // of 3 logic instructions for size savings and potentially speed.
      // Unfortunately, there is no scalar form of VBLENDV.
      //
      // If either operand is a constant, don't try this. We can expect to
      // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.
      //
      // BLENDV was introduced with SSE 4.1, but the 2 register form
      // implicitly uses XMM0 as the selection register. That may need just as
      // many instructions as the AND/ANDN/OR sequence due to register moves,
      // so don't bother.
      if (Subtarget.hasAVX() &&
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
        // Convert to vectors, do a VSELECT, and convert back to scalar.
        // All of the conversions should be optimized away.
        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
        VCmp = DAG.getBitcast(VCmpVT, VCmp);

        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                           VSel, DAG.getIntPtrConstant(0, DL));
      }
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

  // AVX512 fallback is to lower selects of scalar floats to masked moves.
  if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
      Op1Scalar = Op1.getOperand(0);
    SDValue Op2Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
      Op2Scalar = Op2.getOperand(0);
    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
                                        Op1Scalar, Op2Scalar);
      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
        return DAG.getBitcast(VT, newSelect);
      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));
    }
  }

  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
    SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
      // If the condition was updated, it's possible that the operands of the
      // select were also updated (for example, EmitTest has a RAUW). Refresh
      // the local references to the select operands in case they got stale.
      Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);
    }
  }

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
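  // For instance, (select (x != 0), -1, 0) can be emitted branchlessly as:
  //   negl %eax           ; CF = (x != 0)
  //   sbbl %eax, %eax     ; -1 if CF was set, 0 otherwise
  // (illustrative; the exact sequence depends on the condition polarity).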
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);
    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
      SDValue CmpOp0 = Cmp.getOperand(0);

      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (isNullConstant(Y) &&
          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }

      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
               Cmp.getOperand(0).getOpcode() == ISD::AND &&
               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
      SDValue CmpOp0 = Cmp.getOperand(0);
      SDValue Src1, Src2;
      // True if Op2 is an XOR or OR operator and one of its operands
      // is equal to Op1:
      // ( a , a op b) || ( b , a op b)
      auto isOrXorPattern = [&]() {
        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
          Src1 =
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
          Src2 = Op1;
          return true;
        }
        return false;
      };

      if (isOrXorPattern()) {
        SDValue Neg;
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // We need a mask of all zeros or all ones with the same size as the
        // other operands.
        if (CmpSz > VT.getSizeInBits())
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Neg = DAG.getNode(ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));
        else
          Neg = CmpOp0;
        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Neg); // -(and (x, 0x1))
        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
      }
    }
  }
  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT)) // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        AddTest = false;
      }
    }
  }

  if (AddTest) {
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }

  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a
  // truncate widen the cmov and push the truncate through. This avoids
  // introducing a new branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
                                 CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}
18397 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
18398 const X86Subtarget &Subtarget,
18399 SelectionDAG &DAG) {
18400 MVT VT = Op->getSimpleValueType(0);
18401 SDValue In = Op->getOperand(0);
18402 MVT InVT = In.getSimpleValueType();
18403 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
18404 MVT VTElt = VT.getVectorElementType();
18407 unsigned NumElts = VT.getVectorNumElements();
18409 // Extend VT if the scalar type is v8/v16 and BWI is not supported.
18411 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
18412 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
18414 // Widen to 512-bits if VLX is not supported.
18415 MVT WideVT = ExtVT;
18416 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
18417 NumElts *= 512 / ExtVT.getSizeInBits();
18418 InVT = MVT::getVectorVT(MVT::i1, NumElts);
18419 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
18420 In, DAG.getIntPtrConstant(0, dl));
18421 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
18425 MVT WideEltVT = WideVT.getVectorElementType();
18426 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
18427 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
18428 V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
18429 } else {
18430 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
18431 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
18432 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
18433 }
18435 // Truncate if we had to extend i16/i8 above.
18436 if (VT != ExtVT) {
18437 WideVT = MVT::getVectorVT(VTElt, NumElts);
18438 V = DAG.getNode(X86ISD::VTRUNC, dl, WideVT, V);
18439 }
18441 // Extract back to 128/256-bit if we widened.
18442 if (VT != WideVT)
18443 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
18444 DAG.getIntPtrConstant(0, dl));
18446 return V;
18447 }
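// Worked example (assuming an AVX512F-only target, i.e. no VLX/BWI/DQI):
// lowering v8i16 = sign_extend v8i1 picks ExtVT = v8i32, widens the mask to
// v16i1 and selects between all-ones and zero in v16i32, truncates that
// result to v16i16, and finally extracts the low v8i16 subvector.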
18449 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18450 // For sign extend this needs to handle all vector sizes and SSE4.1 and
18451 // non-SSE4.1 targets. For zero extend this should only handle inputs of
18452 // MVT::v64i8 when BWI is not supported, but AVX512 is.
18453 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18454 const X86Subtarget &Subtarget,
18455 SelectionDAG &DAG) {
18456 SDValue In = Op->getOperand(0);
18457 MVT VT = Op->getSimpleValueType(0);
18458 MVT InVT = In.getSimpleValueType();
18459 assert(VT.getSizeInBits() == InVT.getSizeInBits());
18461 MVT SVT = VT.getVectorElementType();
18462 MVT InSVT = InVT.getVectorElementType();
18463 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18465 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18466 return SDValue();
18467 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
18468 return SDValue();
18469 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
18470 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
18471 !(VT.is512BitVector() && Subtarget.hasAVX512()))
18472 return SDValue();
18474 SDLoc dl(Op);
18476 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
18477 // For 512-bit vectors, we need 128-bits or 256-bits.
18478 if (VT.getSizeInBits() > 128) {
18479 // Input needs to be at least the same number of elements as output, and
18480 // at least 128-bits.
18481 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
18482 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
18485 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18486 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18488 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
18489 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
18490 // need to be handled here for 256/512-bit results.
18491 if (Subtarget.hasInt256()) {
18492 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18493 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
18494 X86ISD::VSEXT : X86ISD::VZEXT;
18495 return DAG.getNode(ExtOpc, dl, VT, In);
18498 // We should only get here for sign extend.
18499 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18500 "Unexpected opcode!");
18502 // Pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
18503 SDValue Curr = In;
18504 MVT CurrVT = InVT;
18506 // As SRAI is only available on i16/i32 types, we expand only up to i32
18507 // and handle i64 separately.
18508 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18509 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18510 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18511 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18512 Curr = DAG.getBitcast(CurrVT, Curr);
18513 }
18515 SDValue SignExt = Curr;
18516 if (CurrVT != InVT) {
18517 unsigned SignExtShift =
18518 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18519 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18520 DAG.getConstant(SignExtShift, dl, MVT::i8));
18526 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18527 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18528 DAG.getConstant(31, dl, MVT::i8));
18529 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18530 return DAG.getBitcast(VT, Ext);
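// Sketch of the pre-SSE41 expansion above for v4i32 = sext_inreg of the low
// four i8 lanes: two UNPCKL steps move each byte into the high byte of an
// i32 lane (i8 -> i16 -> i32), then VSRAI by 32 - 8 = 24 replicates the
// sign bit across the lane. The v2i64 tail pairs each i32 with its own
// 31-bit arithmetic-shift copy to synthesize the missing 64-bit SRAI.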
18536 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18537 SelectionDAG &DAG) {
18538 MVT VT = Op->getSimpleValueType(0);
18539 SDValue In = Op->getOperand(0);
18540 MVT InVT = In.getSimpleValueType();
18541 SDLoc dl(Op);
18543 if (InVT.getVectorElementType() == MVT::i1)
18544 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
18546 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
18547 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
18548 (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
18549 (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
18550 (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
18551 (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
18552 (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
18553 (VT != MVT::v32i16 || InVT != MVT::v32i8))
18554 return SDValue();
18556 if (Subtarget.hasInt256())
18557 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18559 // Optimize vectors in AVX mode:
18560 // Sign extend v8i16 to v8i32 and
18561 //             v4i32 to v4i64.
18563 // Divide the input vector into two parts;
18564 // for v4i32 the shuffle mask will be { 0, 1, -1, -1 } { 2, 3, -1, -1 },
18565 // use the vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32,
18566 // then concat the vectors back to the original VT.
18568 unsigned NumElems = InVT.getVectorNumElements();
18569 SDValue Undef = DAG.getUNDEF(InVT);
18571 SmallVector<int,8> ShufMask1(NumElems, -1);
18572 for (unsigned i = 0; i != NumElems/2; ++i)
18573 ShufMask1[i] = i;
18575 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18577 SmallVector<int,8> ShufMask2(NumElems, -1);
18578 for (unsigned i = 0; i != NumElems/2; ++i)
18579 ShufMask2[i] = i + NumElems/2;
18581 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18583 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18584 VT.getVectorNumElements() / 2);
18586 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18587 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18589 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
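// Example of the AVX1 split above (a sketch): v8i32 = sign_extend v8i16
//   OpLo = shuffle <0,1,2,3,u,u,u,u>, OpHi = shuffle <4,5,6,7,u,u,u,u>;
//   each half is sign-extended in-register (vpmovsxwd) to v4i32, and the
//   two halves are concatenated back into v8i32 (vinsertf128).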
18592 // Lower truncating store. We need a special lowering for vXi1 vectors.
18593 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18594 SelectionDAG &DAG) {
18595 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18596 SDLoc dl(StOp);
18597 EVT MemVT = St->getMemoryVT();
18598 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
18599 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18600 "Expected truncstore of i1 vector");
18602 SDValue Op = St->getValue();
18603 MVT OpVT = Op.getValueType().getSimpleVT();
18604 unsigned NumElts = OpVT.getVectorNumElements();
18605 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18607 // Truncate and store - everything is legal
18608 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18609 if (MemVT.getSizeInBits() < 8)
18610 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18611 DAG.getUNDEF(MVT::v8i1), Op,
18612 DAG.getIntPtrConstant(0, dl));
18613 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18614 St->getMemOperand());
18617 // A subset, assume that we have only AVX-512F
18618 if (NumElts <= 8) {
18620 // Extend to an 8-element vector.
18621 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18622 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18623 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18625 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18626 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18627 St->getMemOperand());
18630 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18631 // Divide the vector into 2 parts and store each part separately
18632 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18633 DAG.getIntPtrConstant(0, dl));
18634 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18635 SDValue BasePtr = St->getBasePtr();
18636 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18637 St->getMemOperand());
18638 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18639 DAG.getIntPtrConstant(16, dl));
18640 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18642 SDValue BasePtrHi =
18643 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18644 DAG.getConstant(2, dl, BasePtr.getValueType()));
18646 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18647 BasePtrHi, St->getMemOperand());
18648 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
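// Sketch of the v32i8 path above: the value is split into two v16i8 halves,
// each is truncated to a v16i1 mask (16 bits = 2 bytes in memory), and the
// high half is stored at BasePtr + 2 so the two halves form one contiguous
// 32-bit mask.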
18651 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18652 const X86Subtarget &Subtarget,
18653 SelectionDAG &DAG) {
18654 SDLoc dl(Op);
18655 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18657 EVT MemVT = Ld->getMemoryVT();
18658 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18659 "Expected i1 vector load");
18660 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18661 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18662 MVT VT = Op.getValueType().getSimpleVT();
18663 unsigned NumElts = VT.getVectorNumElements();
18665 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18666 (Subtarget.hasDQI() && NumElts < 16) ||
18668 // Load and extend - everything is legal
18670 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18671 Ld->getBasePtr(),
18672 Ld->getMemOperand());
18673 // Replace chain users with the new chain.
18674 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18675 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18676 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18677 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18679 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18680 DAG.getIntPtrConstant(0, dl));
18682 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18683 Ld->getBasePtr(),
18684 Ld->getMemOperand());
18685 // Replace chain users with the new chain.
18686 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18687 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18689 // Finally, do a normal sign-extend to the desired register.
18690 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18693 if (NumElts <= 8) {
18694 // A subset, assume that we have only AVX-512F
18695 unsigned NumBitsToLoad = 8;
18696 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18697 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18698 Ld->getBasePtr(),
18699 Ld->getMemOperand());
18700 // Replace chain users with the new chain.
18701 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18702 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18704 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18705 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18707 if (NumElts == 8)
18708 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18710 // For v4i1 and v2i1 we need to widen: extend to 8 elements and extract back.
18712 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18713 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18714 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18715 DAG.getIntPtrConstant(0, dl));
18718 assert(VT == MVT::v32i8 && "Unexpected extload type");
18720 SDValue BasePtr = Ld->getBasePtr();
18721 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18722 Ld->getBasePtr(),
18723 Ld->getMemOperand());
18725 SDValue BasePtrHi =
18726 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18727 DAG.getConstant(2, dl, BasePtr.getValueType()));
18729 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18730 BasePtrHi,
18731 Ld->getMemOperand());
18733 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18734 LoadLo.getValue(1), LoadHi.getValue(1));
18735 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18737 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18738 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18739 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
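// This v32i8 path mirrors the truncating-store lowering above: two v16i1
// halves are loaded 2 bytes apart, each extended to v16i8, and concatenated
// back into the requested v32i8 result.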
18742 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18743 // may emit an illegal shuffle but the expansion is still better than scalar
18744 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18745 // we'll emit a shuffle and an arithmetic shift.
18746 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18747 // TODO: It is possible to support ZExt by zeroing the undef values during
18748 // the shuffle phase or after the shuffle.
18749 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18750 SelectionDAG &DAG) {
18751 MVT RegVT = Op.getSimpleValueType();
18752 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18753 assert(RegVT.isInteger() &&
18754 "We only custom lower integer vector sext loads.");
18756 // Nothing useful we can do without SSE2 shuffles.
18757 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18759 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18760 SDLoc dl(Ld);
18761 EVT MemVT = Ld->getMemoryVT();
18762 if (MemVT.getScalarType() == MVT::i1)
18763 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18765 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18766 unsigned RegSz = RegVT.getSizeInBits();
18768 ISD::LoadExtType Ext = Ld->getExtensionType();
18770 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18771 && "Only anyext and sext are currently implemented.");
18772 assert(MemVT != RegVT && "Cannot extend to the same type");
18773 assert(MemVT.isVector() && "Must load a vector from memory");
18775 unsigned NumElems = RegVT.getVectorNumElements();
18776 unsigned MemSz = MemVT.getSizeInBits();
18777 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18779 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18780 // The only way in which we have a legal 256-bit vector result but not the
18781 // integer 256-bit operations needed to directly lower a sextload is if we
18782 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18783 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18784 // correctly legalized. We do this late to allow the canonical form of
18785 // sextload to persist throughout the rest of the DAG combiner -- it wants
18786 // to fold together any extensions it can, and so will fuse a sign_extend
18787 // of an sextload into a sextload targeting a wider value.
18788 SDValue Load;
18789 if (MemSz == 128) {
18790 // Just switch this to a normal load.
18791 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18792 "it must be a legal 128-bit vector "
18793 "type!");
18794 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18795 Ld->getPointerInfo(), Ld->getAlignment(),
18796 Ld->getMemOperand()->getFlags());
18798 assert(MemSz < 128 &&
18799 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18800 // Do an sext load to a 128-bit vector type. We want to use the same
18801 // number of elements, but elements half as wide. This will end up being
18802 // recursively lowered by this routine, but will succeed as we definitely
18803 // have all the necessary features if we're using AVX1.
18805 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18806 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18808 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18809 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18810 Ld->getMemOperand()->getFlags());
18813 // Replace chain users with the new chain.
18814 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18815 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18817 // Finally, do a normal sign-extend to the desired register.
18818 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18821 // All sizes must be a power of two.
18822 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18823 "Non-power-of-two elements are not custom lowered!");
18825 // Attempt to load the original value using scalar loads.
18826 // Find the largest scalar type that divides the total loaded size.
18827 MVT SclrLoadTy = MVT::i8;
18828 for (MVT Tp : MVT::integer_valuetypes()) {
18829 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18830 SclrLoadTy = Tp;
18831 }
18832 }
18834 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
18835 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18836 (64 <= MemSz))
18837 SclrLoadTy = MVT::f64;
18839 // Calculate the number of scalar loads that we need to perform
18840 // in order to load our vector from memory.
18841 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18843 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18844 "Can only lower sext loads with a single scalar load!");
18846 unsigned LoadRegSize = RegSz;
18847 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18848 LoadRegSize = 128;
18850 // If we don't have BWI we won't be able to create the shuffle needed for
18851 // the v8i64 extload.
18852 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
18853 MemVT == MVT::v8i8)
18856 // Represent our vector as a sequence of elements which are the
18857 // largest scalar that we can load.
18858 EVT LoadUnitVecVT = EVT::getVectorVT(
18859 *DAG.getContext(), SclrLoadTy, LoadRegSize / SclrLoadTy.getSizeInBits());
18861 // Represent the data using the same element type that is stored in
18862 // memory. In practice, we "widen" MemVT.
18864 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18865 LoadRegSize / MemVT.getScalarSizeInBits());
18867 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18868 "Invalid vector type");
18870 // We can't shuffle using an illegal type.
18871 assert(TLI.isTypeLegal(WideVecVT) &&
18872 "We only lower types that form legal widened vector types");
18874 SmallVector<SDValue, 8> Chains;
18875 SDValue Ptr = Ld->getBasePtr();
18876 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18877 TLI.getPointerTy(DAG.getDataLayout()));
18878 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18880 for (unsigned i = 0; i < NumLoads; ++i) {
18881 // Perform a single load.
18882 SDValue ScalarLoad =
18883 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18884 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18885 Chains.push_back(ScalarLoad.getValue(1));
18886 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18887 // another round of DAGCombining.
18888 if (i == 0)
18889 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18890 else
18891 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18892 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18894 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18897 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18899 // Bitcast the loaded value to a vector of the original element type, in
18900 // the size of the target vector type.
18901 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18902 unsigned SizeRatio = RegSz / MemSz;
18904 if (Ext == ISD::SEXTLOAD) {
18905 // If we have SSE4.1, we can directly emit a VSEXT node.
18906 if (Subtarget.hasSSE41()) {
18907 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18908 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18909 return Sext;
18910 }
18912 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18913 // lanes.
18914 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18915 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18917 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18918 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18919 return Shuff;
18920 }
18922 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
18923 MemVT == MVT::v8i8) {
18924 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
18925 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18926 return Sext;
18927 }
18929 // Redistribute the loaded elements into the different locations.
18930 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18931 for (unsigned i = 0; i != NumElems; ++i)
18932 ShuffleVec[i * SizeRatio] = i;
18934 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18935 DAG.getUNDEF(WideVecVT), ShuffleVec);
18937 // Bitcast to the requested type.
18938 Shuff = DAG.getBitcast(RegVT, Shuff);
18939 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18940 return Shuff;
18941 }
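// Worked example of the shuffle redistribution (a sketch): for an
// any-extending v4i32 = extload <4 x i8>, the four bytes are loaded as a
// single i32, bitcast to v16i8, and shuffled with mask
//   <0,-1,-1,-1, 1,-1,-1,-1, 2,-1,-1,-1, 3,-1,-1,-1>
// (SizeRatio = 128/32 = 4) so each source byte lands in the low byte of its
// destination lane before the final bitcast to v4i32.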
18943 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18944 /// each of which has no other use apart from the AND / OR.
18945 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18946 Opc = Op.getOpcode();
18947 if (Opc != ISD::OR && Opc != ISD::AND)
18948 return false;
18949 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18950 Op.getOperand(0).hasOneUse() &&
18951 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18952 Op.getOperand(1).hasOneUse());
18955 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18956 /// SETCC node has a single use.
18957 static bool isXor1OfSetCC(SDValue Op) {
18958 if (Op.getOpcode() != ISD::XOR)
18959 return false;
18960 if (isOneConstant(Op.getOperand(1)))
18961 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18962 Op.getOperand(0).hasOneUse();
18963 return false;
18964 }
18966 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18967 bool addTest = true;
18968 SDValue Chain = Op.getOperand(0);
18969 SDValue Cond = Op.getOperand(1);
18970 SDValue Dest = Op.getOperand(2);
18971 SDLoc dl(Op);
18972 SDValue CC;
18973 bool Inverted = false;
18975 if (Cond.getOpcode() == ISD::SETCC) {
18976 // Check for setcc([su]{add,sub,mul}o == 0).
18977 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18978 isNullConstant(Cond.getOperand(1)) &&
18979 Cond.getOperand(0).getResNo() == 1 &&
18980 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18981 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18982 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18983 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18984 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18985 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18987 Cond = Cond.getOperand(0);
18989 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18994 // FIXME: LowerXALUO doesn't handle these!!
18995 else if (Cond.getOpcode() == X86ISD::ADD ||
18996 Cond.getOpcode() == X86ISD::SUB ||
18997 Cond.getOpcode() == X86ISD::SMUL ||
18998 Cond.getOpcode() == X86ISD::UMUL)
18999 Cond = LowerXALUO(Cond, DAG);
19002 // Look past (and (setcc_carry (cmp ...)), 1).
19003 if (Cond.getOpcode() == ISD::AND &&
19004 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19005 isOneConstant(Cond.getOperand(1)))
19006 Cond = Cond.getOperand(0);
19008 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19009 // setting operand in place of the X86ISD::SETCC.
19010 unsigned CondOpcode = Cond.getOpcode();
19011 if (CondOpcode == X86ISD::SETCC ||
19012 CondOpcode == X86ISD::SETCC_CARRY) {
19013 CC = Cond.getOperand(0);
19015 SDValue Cmp = Cond.getOperand(1);
19016 unsigned Opc = Cmp.getOpcode();
19017 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
19018 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
19022 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
19026 // These can only come from an arithmetic instruction with overflow,
19027 // e.g. SADDO, UADDO.
19028 Cond = Cond.getOperand(1);
19034 CondOpcode = Cond.getOpcode();
19035 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19036 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19037 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19038 Cond.getOperand(0).getValueType() != MVT::i8)) {
19039 SDValue LHS = Cond.getOperand(0);
19040 SDValue RHS = Cond.getOperand(1);
19041 unsigned X86Opcode;
19042 unsigned X86Cond;
19043 SDVTList VTs;
19044 // Keep this in sync with LowerXALUO, otherwise we might create redundant
19045 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
19046 // X86ISD::SUB).
19047 switch (CondOpcode) {
19048 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19049 case ISD::SADDO:
19050 if (isOneConstant(RHS)) {
19051 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
19052 break;
19053 }
19054 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19055 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19056 case ISD::SSUBO:
19057 if (isOneConstant(RHS)) {
19058 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
19059 break;
19060 }
19061 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19062 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19063 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19064 default: llvm_unreachable("unexpected overflowing operator");
19065 }
19066 if (Inverted)
19067 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19068 if (CondOpcode == ISD::UMULO)
19069 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19070 MVT::i32);
19071 else
19072 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19074 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19076 if (CondOpcode == ISD::UMULO)
19077 Cond = X86Op.getValue(2);
19078 else
19079 Cond = X86Op.getValue(1);
19081 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19082 addTest = false;
19083 } else {
19084 unsigned CondOpc;
19085 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19086 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19087 if (CondOpc == ISD::OR) {
19088 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
19089 // two branches instead of an explicit OR instruction with a
19090 // separate test.
19091 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19092 isX86LogicalCmp(Cmp)) {
19093 CC = Cond.getOperand(0).getOperand(0);
19094 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19095 Chain, Dest, CC, Cmp);
19096 CC = Cond.getOperand(1).getOperand(0);
19100 } else { // ISD::AND
19101 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19102 // two branches instead of an explicit AND instruction with a
19103 // separate test. However, we only do this if this block doesn't
19104 // have a fall-through edge, because this requires an explicit
19105 // jmp when the condition is false.
19106 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19107 isX86LogicalCmp(Cmp) &&
19108 Op.getNode()->hasOneUse()) {
19109 X86::CondCode CCode =
19110 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19111 CCode = X86::GetOppositeBranchCondition(CCode);
19112 CC = DAG.getConstant(CCode, dl, MVT::i8);
19113 SDNode *User = *Op.getNode()->use_begin();
19114 // Look for an unconditional branch following this conditional branch.
19115 // We need this because we need to reverse the successors in order
19116 // to implement FCMP_OEQ.
19117 if (User->getOpcode() == ISD::BR) {
19118 SDValue FalseBB = User->getOperand(1);
19119 SDNode *NewBR =
19120 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19121 assert(NewBR == User);
19125 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19126 Chain, Dest, CC, Cmp);
19127 X86::CondCode CCode =
19128 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19129 CCode = X86::GetOppositeBranchCondition(CCode);
19130 CC = DAG.getConstant(CCode, dl, MVT::i8);
19136 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19137 // Recognize the xorb (setcc), 1 pattern: the xor inverts the condition.
19138 // It should be transformed during dag combiner except when the condition
19139 // is set by an arithmetic-with-overflow node.
19140 X86::CondCode CCode =
19141 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19142 CCode = X86::GetOppositeBranchCondition(CCode);
19143 CC = DAG.getConstant(CCode, dl, MVT::i8);
19144 Cond = Cond.getOperand(0).getOperand(1);
19146 } else if (Cond.getOpcode() == ISD::SETCC &&
19147 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19148 // For FCMP_OEQ, we can emit
19149 // two branches instead of an explicit AND instruction with a
19150 // separate test. However, we only do this if this block doesn't
19151 // have a fall-through edge, because this requires an explicit
19152 // jmp when the condition is false.
19153 if (Op.getNode()->hasOneUse()) {
19154 SDNode *User = *Op.getNode()->use_begin();
19155 // Look for an unconditional branch following this conditional branch.
19156 // We need this because we need to reverse the successors in order
19157 // to implement FCMP_OEQ.
19158 if (User->getOpcode() == ISD::BR) {
19159 SDValue FalseBB = User->getOperand(1);
19160 SDNode *NewBR =
19161 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19162 assert(NewBR == User);
19166 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19167 Cond.getOperand(0), Cond.getOperand(1));
19168 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19169 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19170 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19171 Chain, Dest, CC, Cmp);
19172 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19177 } else if (Cond.getOpcode() == ISD::SETCC &&
19178 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19179 // For FCMP_UNE, we can emit
19180 // two branches instead of an explicit AND instruction with a
19181 // separate test. However, we only do this if this block doesn't
19182 // have a fall-through edge, because this requires an explicit
19183 // jmp when the condition is false.
19184 if (Op.getNode()->hasOneUse()) {
19185 SDNode *User = *Op.getNode()->use_begin();
19186 // Look for an unconditional branch following this conditional branch.
19187 // We need this because we need to reverse the successors in order
19188 // to implement FCMP_UNE.
19189 if (User->getOpcode() == ISD::BR) {
19190 SDValue FalseBB = User->getOperand(1);
19191 SDNode *NewBR =
19192 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19193 assert(NewBR == User);
19196 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19197 Cond.getOperand(0), Cond.getOperand(1));
19198 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19199 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19200 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19201 Chain, Dest, CC, Cmp);
19202 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19212 // Look past the truncate if the high bits are known zero.
19213 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19214 Cond = Cond.getOperand(0);
19216 // We know the result of AND is compared against zero. Try to match
19218 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19219 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19220 CC = NewSetCC.getOperand(0);
19221 Cond = NewSetCC.getOperand(1);
19228 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19229 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19230 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19232 Cond = ConvertCmpIfNecessary(Cond, DAG);
19233 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19234 Chain, Dest, CC, Cond);
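// Illustrative emission for the FCMP_OEQ case above (a sketch): after the
// successors are reversed, br (setoeq x, y) becomes a single UCOMISS/UCOMISD
// followed by two branches -- JNE and JP, both to the false block -- since
// "ordered and equal" is the conjunction ZF = 1 and PF = 0.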
19237 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19238 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19239 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19240 // that the guard pages used by the OS virtual memory manager are allocated in
19241 // correct sequence.
19243 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19244 SelectionDAG &DAG) const {
19245 MachineFunction &MF = DAG.getMachineFunction();
19246 bool SplitStack = MF.shouldSplitStack();
19247 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19248 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19249 SplitStack || EmitStackProbe;
19253 SDNode *Node = Op.getNode();
19254 SDValue Chain = Op.getOperand(0);
19255 SDValue Size = Op.getOperand(1);
19256 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19257 EVT VT = Node->getValueType(0);
19259 // Chain the dynamic stack allocation so that it doesn't modify the stack
19260 // pointer when other instructions are using the stack.
19261 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19263 bool Is64Bit = Subtarget.is64Bit();
19264 MVT SPTy = getPointerTy(DAG.getDataLayout());
19268 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19269 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19270 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19271 " not tell us which reg is the stack pointer!");
19273 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19274 Chain = SP.getValue(1);
19275 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19276 unsigned StackAlign = TFI.getStackAlignment();
19277 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19278 if (Align > StackAlign)
19279 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19280 DAG.getConstant(-(uint64_t)Align, dl, VT));
19281 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19282 } else if (SplitStack) {
19283 MachineRegisterInfo &MRI = MF.getRegInfo();
19286 // The 64-bit implementation of segmented stacks needs to clobber both r10
19287 // and r11. This makes it impossible to use it along with nested parameters.
19288 const Function &F = MF.getFunction();
19289 for (const auto &A : F.args()) {
19290 if (A.hasNestAttr())
19291 report_fatal_error("Cannot use segmented stacks with functions that "
19292 "have nested arguments.");
19296 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19297 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19298 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19299 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19300 DAG.getRegister(Vreg, SPTy));
19302 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19303 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19304 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19306 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19307 unsigned SPReg = RegInfo->getStackRegister();
19308 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19309 Chain = SP.getValue(1);
19311 if (Align) {
19312 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19313 DAG.getConstant(-(uint64_t)Align, dl, VT));
19314 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19320 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19321 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19323 SDValue Ops[2] = {Result, Chain};
19324 return DAG.getMergeValues(Ops, dl);
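// Alignment math used above, with assumed numbers: if Align = 32 exceeds the
// target stack alignment, the new stack pointer is computed as
//   Result = (SP - Size) & -(uint64_t)32
// which rounds the allocation down to the next 32-byte boundary.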
19327 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
19328 MachineFunction &MF = DAG.getMachineFunction();
19329 auto PtrVT = getPointerTy(MF.getDataLayout());
19330 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19332 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19333 SDLoc DL(Op);
19335 if (!Subtarget.is64Bit() ||
19336 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
19337 // vastart just stores the address of the VarArgsFrameIndex slot into the
19338 // memory location argument.
19339 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19340 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
19341 MachinePointerInfo(SV));
19344 // __va_list_tag:
19345 //   gp_offset         (0 - 6 * 8)
19346 //   fp_offset         (48 - 48 + 8 * 16)
19347 //   overflow_arg_area (point to parameters coming in memory).
19348 //   reg_save_area
19349 SmallVector<SDValue, 8> MemOps;
19350 SDValue FIN = Op.getOperand(1);
19352 SDValue Store = DAG.getStore(
19353 Op.getOperand(0), DL,
19354 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
19355 MachinePointerInfo(SV));
19356 MemOps.push_back(Store);
19359 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
19360 Store = DAG.getStore(
19361 Op.getOperand(0), DL,
19362 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
19363 MachinePointerInfo(SV, 4));
19364 MemOps.push_back(Store);
19366 // Store ptr to overflow_arg_area
19367 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
19368 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19370 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
19371 MemOps.push_back(Store);
19373 // Store ptr to reg_save_area.
19374 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
19375 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
19376 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
19377 Store = DAG.getStore(
19378 Op.getOperand(0), DL, RSFIN, FIN,
19379 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
19380 MemOps.push_back(Store);
19381 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
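// For reference, the stores above populate the SysV x86-64 va_list:
//   struct __va_list_tag {
//     unsigned gp_offset;      // offset 0
//     unsigned fp_offset;      // offset 4
//     void *overflow_arg_area; // offset 8
//     void *reg_save_area;     // offset 16 (12 on ILP32)
//   };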
19384 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
19385 assert(Subtarget.is64Bit() &&
19386 "LowerVAARG only handles 64-bit va_arg!");
19387 assert(Op.getNumOperands() == 4);
19389 MachineFunction &MF = DAG.getMachineFunction();
19390 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
19391 // The Win64 ABI uses char* instead of a structure.
19392 return DAG.expandVAArg(Op.getNode());
19394 SDValue Chain = Op.getOperand(0);
19395 SDValue SrcPtr = Op.getOperand(1);
19396 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19397 unsigned Align = Op.getConstantOperandVal(3);
19398 SDLoc dl(Op);
19400 EVT ArgVT = Op.getNode()->getValueType(0);
19401 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19402 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
19404 unsigned ArgMode;
19405 // Decide which area this value should be read from.
19406 // TODO: Implement the AMD64 ABI in its entirety. This simple
19407 // selection mechanism works only for the basic types.
19408 if (ArgVT == MVT::f80) {
19409 llvm_unreachable("va_arg for f80 not yet implemented");
19410 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
19411 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
19412 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
19413 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
19415 llvm_unreachable("Unhandled argument type in LowerVAARG");
19418 if (ArgMode == 2) {
19419 // Sanity Check: Make sure using fp_offset makes sense.
19420 assert(!Subtarget.useSoftFloat() &&
19421 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
19422 Subtarget.hasSSE1());
19425 // Insert VAARG_64 node into the DAG
19426 // VAARG_64 returns two values: Variable Argument Address, Chain
19427 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
19428 DAG.getConstant(ArgMode, dl, MVT::i8),
19429 DAG.getConstant(Align, dl, MVT::i32)};
19430 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19431 SDValue VAARG = DAG.getMemIntrinsicNode(
19432 X86ISD::VAARG_64, dl,
19433 VTs, InstOps, MVT::i64,
19434 MachinePointerInfo(SV),
19436 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
19437 Chain = VAARG.getValue(1);
19439 // Load the next argument and return it
19440 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19443 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19444 SelectionDAG &DAG) {
19445 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19446 // where a va_list is still an i8*.
19447 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19448 if (Subtarget.isCallingConvWin64(
19449 DAG.getMachineFunction().getFunction().getCallingConv()))
19450 // Probably a Win64 va_copy.
19451 return DAG.expandVACopy(Op.getNode());
19453 SDValue Chain = Op.getOperand(0);
19454 SDValue DstPtr = Op.getOperand(1);
19455 SDValue SrcPtr = Op.getOperand(2);
19456 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19457 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19459 SDLoc DL(Op);
19460 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19461 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19463 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
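// The 24-byte memcpy above is sizeof(__va_list_tag) on x86-64:
// 4 (gp_offset) + 4 (fp_offset) + 8 (overflow_arg_area) + 8 (reg_save_area).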
19466 /// Handle vector element shifts where the shift amount is a constant.
19467 /// Takes immediate version of shift as input.
19468 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19469 SDValue SrcOp, uint64_t ShiftAmt,
19470 SelectionDAG &DAG) {
19471 MVT ElementType = VT.getVectorElementType();
19473 // Bitcast the source vector to the output type; this is mainly necessary for
19474 // vXi8/vXi64 shifts.
19475 if (VT != SrcOp.getSimpleValueType())
19476 SrcOp = DAG.getBitcast(VT, SrcOp);
19478 // Fold this packed shift into its first operand if ShiftAmt is 0.
19479 if (ShiftAmt == 0)
19480 return SrcOp;
19482 // Check for ShiftAmt >= element width
19483 if (ShiftAmt >= ElementType.getSizeInBits()) {
19484 if (Opc == X86ISD::VSRAI)
19485 ShiftAmt = ElementType.getSizeInBits() - 1;
19487 return DAG.getConstant(0, dl, VT);
19490 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19491 && "Unknown target vector shift-by-constant node");
19493 // Fold this packed vector shift into a build vector if SrcOp is a
19494 // vector of Constants or UNDEFs.
19495 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19496 SmallVector<SDValue, 8> Elts;
19497 unsigned NumElts = SrcOp->getNumOperands();
19498 ConstantSDNode *ND;
19501 default: llvm_unreachable("Unknown opcode!");
19502 case X86ISD::VSHLI:
19503 for (unsigned i=0; i!=NumElts; ++i) {
19504 SDValue CurrentOp = SrcOp->getOperand(i);
19505 if (CurrentOp->isUndef()) {
19506 Elts.push_back(CurrentOp);
19509 ND = cast<ConstantSDNode>(CurrentOp);
19510 const APInt &C = ND->getAPIntValue();
19511 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19514 case X86ISD::VSRLI:
19515 for (unsigned i=0; i!=NumElts; ++i) {
19516 SDValue CurrentOp = SrcOp->getOperand(i);
19517 if (CurrentOp->isUndef()) {
19518 Elts.push_back(CurrentOp);
19521 ND = cast<ConstantSDNode>(CurrentOp);
19522 const APInt &C = ND->getAPIntValue();
19523 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19526 case X86ISD::VSRAI:
19527 for (unsigned i=0; i!=NumElts; ++i) {
19528 SDValue CurrentOp = SrcOp->getOperand(i);
19529 if (CurrentOp->isUndef()) {
19530 Elts.push_back(CurrentOp);
19533 ND = cast<ConstantSDNode>(CurrentOp);
19534 const APInt &C = ND->getAPIntValue();
19535 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19540 return DAG.getBuildVector(VT, dl, Elts);
19543 return DAG.getNode(Opc, dl, VT, SrcOp,
19544 DAG.getConstant(ShiftAmt, dl, MVT::i8));
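// Constant-folding example for the build_vector path above (a sketch):
//   VSHLI <i32 1, i32 2, i32 undef, i32 3>, 4
// becomes build_vector <i32 16, i32 32, i32 undef, i32 48>, so no shift
// instruction is emitted at all.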
19547 /// Handle vector element shifts where the shift amount may or may not be a
19548 /// constant. Takes immediate version of shift as input.
19549 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19550 SDValue SrcOp, SDValue ShAmt,
19551 const X86Subtarget &Subtarget,
19552 SelectionDAG &DAG) {
19553 MVT SVT = ShAmt.getSimpleValueType();
19554 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19556 // Catch shift-by-constant.
19557 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19558 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19559 CShAmt->getZExtValue(), DAG);
19561 // Change opcode to non-immediate version.
19562 switch (Opc) {
19563 default: llvm_unreachable("Unknown target vector shift node");
19564 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19565 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19566 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19567 }
19569 // Need to build a vector containing shift amount.
19570 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
19571 // +=================+============+=======================================+
19572 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
19573 // +=================+============+=======================================+
19574 // | i64 | Yes, No | Use ShAmt as lowest elt |
19575 // | i32 | Yes | zero-extend in-reg |
19576 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
19577 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud) |
19578 // +=================+============+=======================================+
19580 if (SVT == MVT::i64)
19581 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19582 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19583 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19584 ShAmt = ShAmt.getOperand(0);
19585 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19586 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19587 } else if (Subtarget.hasSSE41() &&
19588 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19589 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19590 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19591 } else {
19592 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
19593 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19594 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19595 }
19597 // The return type has to be a 128-bit type with the same element
19598 // type as the input type.
19599 MVT EltVT = VT.getVectorElementType();
19600 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19602 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19603 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
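// Example (a sketch): shifting v4i32 by an i32 amount on a pre-SSE41 target
// materializes the count as build_vector(ShAmt, 0, undef, undef), bitcasts
// it to the 128-bit count operand, and emits VSHL/VSRL/VSRA, which read only
// the low 64 bits of the count register.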
19606 /// \brief Return Mask with the necessary casting or extending
19607 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19608 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19609 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19612 if (isAllOnesConstant(Mask))
19613 return DAG.getConstant(1, dl, MaskVT);
19614 if (X86::isZeroNode(Mask))
19615 return DAG.getConstant(0, dl, MaskVT);
19617 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19618 // Mask should be extended
19619 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19620 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19623 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19624 if (MaskVT == MVT::v64i1) {
19625 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19626 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
19627 SDValue Lo, Hi;
19628 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19629 DAG.getConstant(0, dl, MVT::i32));
19630 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19631 DAG.getConstant(1, dl, MVT::i32));
19633 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19634 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19636 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19638 // MaskVT requires fewer than 64 bits: truncate the mask (this should
19639 // succeed in any case) and bitcast it to the required type.
19640 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19641 return DAG.getBitcast(MaskVT,
19642 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19646 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19647 Mask.getSimpleValueType().getSizeInBits());
19648 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
19649 // are extracted by EXTRACT_SUBVECTOR.
19650 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19651 DAG.getBitcast(BitcastVT, Mask),
19652 DAG.getIntPtrConstant(0, dl));
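// Example (a sketch): an i8 mask arriving for MaskVT = v4i1 needs no
// extension (4 bits < 8), so it is bitcast to v8i1 and the low four elements
// are taken with the EXTRACT_SUBVECTOR above.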
19656 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19657 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19658 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19659 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19660 SDValue PreservedSrc,
19661 const X86Subtarget &Subtarget,
19662 SelectionDAG &DAG) {
19663 MVT VT = Op.getSimpleValueType();
19664 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19665 unsigned OpcodeSelect = ISD::VSELECT;
19666 SDLoc dl(Op);
19668 if (isAllOnesConstant(Mask))
19669 return Op;
19671 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19673 switch (Op.getOpcode()) {
19676 case X86ISD::CMPM_RND:
19677 case X86ISD::CMPMU:
19678 case X86ISD::VPSHUFBITQMB:
19679 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19680 case X86ISD::VFPCLASS:
19681 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19682 case X86ISD::VTRUNC:
19683 case X86ISD::VTRUNCS:
19684 case X86ISD::VTRUNCUS:
19685 case X86ISD::CVTPS2PH:
19686 // We can't use ISD::VSELECT here because it is not always "Legal"
19687 // for the destination type. For example vpmovqb requires only AVX512,
19688 // while a vselect that operates on byte elements requires BWI.
19689 OpcodeSelect = X86ISD::SELECT;
19692 if (PreservedSrc.isUndef())
19693 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19694 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19697 /// \brief Creates an SDNode for a predicated scalar operation.
19698 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19699 /// The mask comes in as MVT::i8 and it should be transformed
19700 /// to MVT::v1i1 while lowering masking intrinsics.
19701 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19702 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19703 /// for a scalar instruction.
19704 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19705 SDValue PreservedSrc,
19706 const X86Subtarget &Subtarget,
19707 SelectionDAG &DAG) {
19709 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19710 if (MaskConst->getZExtValue() & 0x1)
19711 return Op;
19713 MVT VT = Op.getSimpleValueType();
19714 SDLoc dl(Op);
19716 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19717 if (Op.getOpcode() == X86ISD::FSETCCM ||
19718 Op.getOpcode() == X86ISD::FSETCCM_RND)
19719 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19720 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19721 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19723 if (PreservedSrc.isUndef())
19724 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19725 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19728 static int getSEHRegistrationNodeSize(const Function *Fn) {
19729 if (!Fn->hasPersonalityFn())
19730 report_fatal_error(
19731 "querying registration node size for function without personality");
19732 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19733 // WinEHStatePass for the full struct definition.
19734 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19735 case EHPersonality::MSVC_X86SEH: return 24;
19736 case EHPersonality::MSVC_CXX: return 16;
19737 default: break;
19738 }
19739 report_fatal_error(
19740 "can only recover FP for 32-bit MSVC EH personality functions");
19743 /// When the MSVC runtime transfers control to us, either to an outlined
19744 /// function or when returning to a parent frame after catching an exception, we
19745 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19746 /// Here's the math:
19747 /// RegNodeBase = EntryEBP - RegNodeSize
19748 /// ParentFP = RegNodeBase - ParentFrameOffset
19749 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19750 /// subtracting the offset (negative on x86) takes us back to the parent FP.
19751 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19752 SDValue EntryEBP) {
19753 MachineFunction &MF = DAG.getMachineFunction();
19756 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19757 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19759 // It's possible that the parent function no longer has a personality function
19760 // if the exceptional code was optimized away, in which case we just return
19761 // the incoming EBP.
19762 if (!Fn->hasPersonalityFn())
19765 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19766 // registration, or the .set_setframe offset.
19767 MCSymbol *OffsetSym =
19768 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19769 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19770 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19771 SDValue ParentFrameOffset =
19772 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19774 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19775 // prologue to RBP in the parent function.
19776 const X86Subtarget &Subtarget =
19777 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19778 if (Subtarget.is64Bit())
19779 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19781 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19782 // RegNodeBase = EntryEBP - RegNodeSize
19783 // ParentFP = RegNodeBase - ParentFrameOffset
19784 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19785 DAG.getConstant(RegNodeSize, dl, PtrVT));
19786 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
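// Worked example with assumed values: for the 32-bit MSVC C++ personality,
// RegNodeSize = 16; if ParentFrameOffset resolves to -40, then
//   ParentFP = (EntryEBP - 16) - (-40) = EntryEBP + 24.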
19789 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
19790 SelectionDAG &DAG) const {
19791 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19792 auto isRoundModeCurDirection = [](SDValue Rnd) {
19793 if (!isa<ConstantSDNode>(Rnd))
19796 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19797 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19798 };
19800 SDLoc dl(Op);
19801 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19802 MVT VT = Op.getSimpleValueType();
19803 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19804 if (IntrData) {
19805 switch (IntrData->Type) {
19806 case INTR_TYPE_1OP:
19807 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19808 case INTR_TYPE_2OP:
19809 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19811 case INTR_TYPE_3OP:
19812 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19813 Op.getOperand(2), Op.getOperand(3));
19814 case INTR_TYPE_4OP:
19815 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19816 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19817 case INTR_TYPE_1OP_MASK_RM: {
19818 SDValue Src = Op.getOperand(1);
19819 SDValue PassThru = Op.getOperand(2);
19820 SDValue Mask = Op.getOperand(3);
19821 SDValue RoundingMode;
19822 // We always add rounding mode to the Node.
19823 // If the rounding mode is not specified, we add the
19824 // "current direction" mode.
19825 if (Op.getNumOperands() == 4)
19826 RoundingMode =
19827 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19828 else
19829 RoundingMode = Op.getOperand(4);
19830 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19831 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19832 RoundingMode),
19833 Mask, PassThru, Subtarget, DAG);
19835 case INTR_TYPE_1OP_MASK: {
19836 SDValue Src = Op.getOperand(1);
19837 SDValue PassThru = Op.getOperand(2);
19838 SDValue Mask = Op.getOperand(3);
19839 // We add rounding mode to the Node when
19840 // - RM Opcode is specified and
19841 // - RM is not "current direction".
19842 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19843 if (IntrWithRoundingModeOpcode != 0) {
19844 SDValue Rnd = Op.getOperand(4);
19845 if (!isRoundModeCurDirection(Rnd)) {
19846 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19847 dl, Op.getValueType(),
19848 Src, Rnd),
19849 Mask, PassThru, Subtarget, DAG);
19852 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19853 Mask, PassThru, Subtarget, DAG);
19855 case INTR_TYPE_SCALAR_MASK: {
19856 SDValue Src1 = Op.getOperand(1);
19857 SDValue Src2 = Op.getOperand(2);
19858 SDValue passThru = Op.getOperand(3);
19859 SDValue Mask = Op.getOperand(4);
19860 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19861 // There are 2 kinds of intrinsics in this group:
19862 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19863 // (2) With rounding mode and sae - 7 operands.
19864 bool HasRounding = IntrWithRoundingModeOpcode != 0;
19865 if (Op.getNumOperands() == (5U + HasRounding)) {
19866 if (HasRounding) {
19867 SDValue Rnd = Op.getOperand(5);
19868 if (!isRoundModeCurDirection(Rnd))
19869 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19870 dl, VT, Src1, Src2, Rnd),
19871 Mask, passThru, Subtarget, DAG);
19872 }
19873 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19874 Src2),
19875 Mask, passThru, Subtarget, DAG);
19878 assert(Op.getNumOperands() == (6U + HasRounding) &&
19879 "Unexpected intrinsic form");
19880 SDValue RoundingMode = Op.getOperand(5);
19881 if (HasRounding) {
19882 SDValue Sae = Op.getOperand(6);
19883 if (!isRoundModeCurDirection(Sae))
19884 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19885 dl, VT, Src1, Src2,
19886 RoundingMode, Sae),
19887 Mask, passThru, Subtarget, DAG);
19888 }
19889 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19890 Src2, RoundingMode),
19891 Mask, passThru, Subtarget, DAG);
19893 case INTR_TYPE_SCALAR_MASK_RM: {
19894 SDValue Src1 = Op.getOperand(1);
19895 SDValue Src2 = Op.getOperand(2);
19896 SDValue Src0 = Op.getOperand(3);
19897 SDValue Mask = Op.getOperand(4);
19898 // There are 2 kinds of intrinsics in this group:
19899 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19900 // (2) With rounding mode and sae - 7 operands.
19901 if (Op.getNumOperands() == 6) {
19902 SDValue Sae = Op.getOperand(5);
19903 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19904 Sae),
19905 Mask, Src0, Subtarget, DAG);
19906 }
19907 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19908 SDValue RoundingMode = Op.getOperand(5);
19909 SDValue Sae = Op.getOperand(6);
19910 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19911 RoundingMode, Sae),
19912 Mask, Src0, Subtarget, DAG);
19914 case INTR_TYPE_2OP_MASK:
19915 case INTR_TYPE_2OP_IMM8_MASK: {
19916 SDValue Src1 = Op.getOperand(1);
19917 SDValue Src2 = Op.getOperand(2);
19918 SDValue PassThru = Op.getOperand(3);
19919 SDValue Mask = Op.getOperand(4);
19921 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19922 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19924 // We specify 2 possible opcodes for intrinsics with rounding modes.
19925 // First, we check if the intrinsic may have non-default rounding mode,
19926 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19927 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19928 if (IntrWithRoundingModeOpcode != 0) {
19929 SDValue Rnd = Op.getOperand(5);
19930 if (!isRoundModeCurDirection(Rnd)) {
19931 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19932 dl, Op.getValueType(),
19934 Mask, PassThru, Subtarget, DAG);
19937 // TODO: Intrinsics should have fast-math-flags to propagate.
19938 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19939 Mask, PassThru, Subtarget, DAG);
19941 case INTR_TYPE_2OP_MASK_RM: {
19942 SDValue Src1 = Op.getOperand(1);
19943 SDValue Src2 = Op.getOperand(2);
19944 SDValue PassThru = Op.getOperand(3);
19945 SDValue Mask = Op.getOperand(4);
19946 // We specify 2 possible modes for intrinsics, with/without rounding
19948 // First, we check if the intrinsic have rounding mode (6 operands),
19949 // if not, we set rounding mode to "current".
19951 if (Op.getNumOperands() == 6)
19952 Rnd = Op.getOperand(5);
19954 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19955 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19957 Mask, PassThru, Subtarget, DAG);
    case INTR_TYPE_3OP_SCALAR_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);

      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(6);
        if (!isRoundModeCurDirection(Rnd))
          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, VT, Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Imm = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // modes.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Imm, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_IMM8_MASK:
    case INTR_TYPE_3OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);

      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have non-default rounding mode,
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(6);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_2OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src2, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_3OP_MASKZ:
    case VPERM_3OP_MASK: {
      MVT VT = Op.getSimpleValueType();
      // Src2 is the PassThru.
      SDValue Src1 = Op.getOperand(1);
      // PassThru needs to be the same type as the destination in order
      // to pattern match correctly.
      SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == VPERM_3OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else
        PassThru = Src2;

      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src1, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_MASKZ:
    case FMA_OP_MASK3:
    case FMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have non-default rounding mode,
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
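    // Note: for the "mask3" FMA forms the pass-through is the addend (Src3)
    // because the corresponding vfmadd231-style instructions leave the
    // inactive lanes of that register operand unmodified; the plain masked
    // forms keep Src1 instead.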
    case FMA_OP_SCALAR_MASK:
    case FMA_OP_SCALAR_MASK3:
    case FMA_OP_SCALAR_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
                                                  Op.getValueType(), Src1, Src2,
                                                  Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }

      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                              Op.getValueType(), Src1, Src2,
                                              Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case IFMA_OP_MASKZ:
    case IFMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;

      // Set the PassThru element.
      if (IntrData->Type == IFMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      // Note that we need to swizzle the operands to pass the multiply
      // operands first.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src3, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
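    // The TERLOG cases below lower vpternlog{d,q}: the truncated i8 immediate
    // is the truth table of an arbitrary three-input boolean function
    // (result bit = Imm[a*4 + b*2 + c]). For instance, 0xCA encodes the
    // bitwise select a ? b : c.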
    case TERLOG_OP_MASK:
    case TERLOG_OP_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
      SDValue Mask = Op.getOperand(5);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;
      // Set the PassThru element.
      if (IntrData->Type == TERLOG_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3, Src4),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CVTPD2PS:
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
                         DAG.getIntPtrConstant(0, dl));
    case CVTPD2PS_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add rounding mode to the Node when
      //   - RM Opcode is specified and
      //   - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FPCLASS: {
      // FPclass intrinsics with mask.
      SDValue Src1 = Op.getOperand(1);
      MVT VT = Src1.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
                                                 Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), FPclassMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
                                                 Subtarget, DAG);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
                         DAG.getIntPtrConstant(0, dl));
    }
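    // The compare-style cases below produce a narrow vXi1 mask. Since the
    // intrinsics return an integer mask, the vXi1 value is first widened with
    // INSERT_SUBVECTOR into a mask that covers the whole return type (e.g.
    // v2i1 -> v8i1) and then bitcast to the scalar integer (v8i1 -> i8).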
    case CMP_MASK:
    case CMP_MASK_CC: {
      // Comparison intrinsics with masks.
      // Example of transformation:
      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      // (i8 (bitcast
      //   (v8i1 (insert_subvector undef,
      //           (v2i1 (and (PCMPEQM %a, %b),
      //                      (extract_subvector
      //                         (v8i1 (bitcast %mask)), 0))), 0))))
      MVT VT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue Cmp;
      if (IntrData->Type == CMP_MASK_CC) {
        SDValue CC = Op.getOperand(3);
        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have non-default rounding mode,
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
        if (IntrData->Opc1 != 0) {
          SDValue Rnd = Op.getOperand(5);
          if (!isRoundModeCurDirection(Rnd))
            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                              Op.getOperand(2), CC, Rnd);
        }
        // Default rounding mode.
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
      } else {
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2));
      }
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
                                             Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CmpMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CMP_MASK_SCALAR_CC: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);

      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
      }
      // Default rounding mode.
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
                                             Subtarget, DAG);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
                         DAG.getIntPtrConstant(0, dl));
    }
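    // (U)COMIS compares two scalars and sets ZF/PF/CF like an unsigned
    // compare; an unordered result (a NaN operand) sets all three flags.
    // That is why the SETEQ/SETNE mappings below also have to look at the
    // parity flag.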
    case COMI: { // Comparison intrinsics
      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
      case ISD::SETNE: { // (ZF = 1 or PF = 1)
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
      case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
      case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
    case COMI_RM: { // Comparison intrinsics with Sae.
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
      if (isRoundModeCurDirection(Sae))
        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8), Sae);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
                         DAG.getIntPtrConstant(0, dl));
    }
    case VSHIFT:
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
                                 DAG);
    case COMPRESS_EXPAND_IN_REG: {
      SDValue Mask = Op.getOperand(3);
      SDValue DataToCompress = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      if (isAllOnesConstant(Mask)) // Return data as is.
        return Op.getOperand(1);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              DataToCompress),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case BROADCASTM: {
      SDValue Mask = Op.getOperand(1);
      MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                    Mask.getSimpleValueType().getSizeInBits());
      Mask = DAG.getBitcast(MaskVT, Mask);
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
    }
    case MASK_BINOP: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
      return DAG.getBitcast(VT, Res);
    }
    case FIXUPIMMS:
    case FIXUPIMMS_MASKZ:
    case FIXUPIMM:
    case FIXUPIMM_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Imm = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Passthru =
          (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS)
              ? Src1
              : getZeroVector(VT, Subtarget, DAG, dl);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // modes.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
    }
    case CONVERT_TO_MASK: {
      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
                                    Op.getOperand(1));
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CvtMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case ROUNDP: {
      assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
      // Clear the upper bits of the rounding immediate so that the legacy
      // intrinsic can't trigger the scaling behavior of VRNDSCALE.
      SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
                                         Op.getOperand(2),
                                         DAG.getConstant(0xf, dl, MVT::i32));
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Op.getOperand(1), RoundingMode);
    }
    case ROUNDS: {
      assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
      // Clear the upper bits of the rounding immediate so that the legacy
      // intrinsic can't trigger the scaling behavior of VRNDSCALE.
      SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
                                         Op.getOperand(3),
                                         DAG.getConstant(0xf, dl, MVT::i32));
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2), RoundingMode);
    }
    default:
      break;
    }
  }
  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
    // Operands intentionally swapped. Mask is last operand to intrinsic,
    // but second operand for node/instruction.
    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(1));
  // ptest and testp intrinsics. The intrinsics these come from are designed
  // to return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
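  // KORTEST ORs two mask registers and only sets flags: ZF if the result is
  // all zeros and CF if it is all ones. kortestz therefore reads ZF (COND_E)
  // and kortestc reads CF (COND_B).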
  case Intrinsic::x86_avx512_kortestz_w:
  case Intrinsic::x86_avx512_kortestc_w: {
    X86::CondCode X86CC =
        (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
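  // The mask-logic intrinsics below are lowered to generic XOR/AND on v16i1
  // rather than to target nodes; this lets the DAG combiner fold them with
  // other mask operations, and instruction selection can still pick the
  // k-register forms (knot/kandn/kxnor) where profitable.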
  case Intrinsic::x86_avx512_knot_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kandn_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    // Invert LHS for the not.
    LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
                      DAG.getConstant(1, dl, MVT::v16i1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kxnor_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    // Invert result for the not.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
                      DAG.getConstant(1, dl, MVT::v16i1));
    return DAG.getBitcast(MVT::i16, Res);
  }
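  // The SSE4.2 string-compare instructions produce two results: the match
  // index in ECX and EFLAGS. The flag-reading intrinsics below (...a/...c/
  // ...o/...s/...z) run the compare and then setcc on the flags result, while
  // the plain ...i128 forms further down return the index itself.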
  case Intrinsic::x86_sse42_pcmpistria128:
  case Intrinsic::x86_sse42_pcmpestria128:
  case Intrinsic::x86_sse42_pcmpistric128:
  case Intrinsic::x86_sse42_pcmpestric128:
  case Intrinsic::x86_sse42_pcmpistrio128:
  case Intrinsic::x86_sse42_pcmpestrio128:
  case Intrinsic::x86_sse42_pcmpistris128:
  case Intrinsic::x86_sse42_pcmpestris128:
  case Intrinsic::x86_sse42_pcmpistriz128:
  case Intrinsic::x86_sse42_pcmpestriz128: {
    unsigned Opcode;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
    case Intrinsic::x86_sse42_pcmpistria128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpestria128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpistric128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpestric128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpistrio128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpestrio128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpistris128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpestris128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpistriz128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse42_pcmpestriz128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_E;
      break;
    }
    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
    SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_sse42_pcmpistri128:
  case Intrinsic::x86_sse42_pcmpestri128: {
    unsigned Opcode;
    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
      Opcode = X86ISD::PCMPISTRI;
    else
      Opcode = X86ISD::PCMPESTRI;

    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(Opcode, dl, VTs, NewOps);
  }
  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    auto &Context = MF.getMMI().getContext();
    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
                                            Twine(MF.getFunctionNumber()));
    return DAG.getNode(getGlobalWrapperKind(), dl, VT,
                       DAG.getMCSymbol(S, PtrVT));
  }

  case Intrinsic::x86_seh_lsda: {
    // Compute the symbol for the LSDA. We know it'll get emitted later.
    MachineFunction &MF = DAG.getMachineFunction();
    SDValue Op1 = Op.getOperand(1);
    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
        GlobalValue::dropLLVMManglingEscape(Fn->getName()));

    // Generate a simple absolute symbol reference. This intrinsic is only
    // supported on 32-bit Windows, which isn't PIC.
    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
  }

  case Intrinsic::x86_seh_recoverfp: {
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.x86.seh.recoverfp must take a function as the first argument");
    return recoverFramePointer(DAG, Fn, IncomingFPOp);
  }

  case Intrinsic::localaddress: {
    // Returns one of the stack, base, or frame pointer registers, depending on
    // which is used to reference local variables.
    MachineFunction &MF = DAG.getMachineFunction();
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned Reg;
    if (RegInfo->hasBasePointer(MF))
      Reg = RegInfo->getBaseRegister();
    else // This function handles the SP or FP case.
      Reg = RegInfo->getPtrSizedFrameRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  }
  }
}
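// The gather/scatter helpers below build machine nodes directly. X86 memory
// operands are always the 5-tuple (Base, Scale, Index, Disp, Segment); these
// intrinsics use a zero displacement and no segment register, so only Base,
// Scale and Index vary.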
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  EVT MaskVT = Mask.getValueType();
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  return SDValue(Res, 1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT =
      MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}
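// The next three helpers (XGETBV, RDPMC, RDTSC/RDTSCP) share one idiom: the
// instruction leaves a 64-bit value split across EDX:EAX. On 64-bit targets
// the halves are recombined as (or LO, (shl HI, 32)); on 32-bit targets a
// BUILD_PAIR forms the i64 from the two i32 halves.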
/// Handles the lowering of builtin intrinsics that return the value
/// of the extended control register.
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the XCR register to
  // return.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
  Chain = SDValue(N1, 0);

  // Reads the content of XCR and returns it in registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
/// Handles the lowering of builtin intrinsics that read performance monitor
/// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);

  // Reads the content of a 64-bit performance counter and returns it in the
  // registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // The EAX register is loaded with the low-order 32 bits. The EDX register
    // is loaded with the supported high-order bits of the counter.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    SmallVectorImpl<SDValue> &Results) {
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
  SDValue LO, HI;

  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  SDValue Chain = HI.getValue(1);

  if (Opcode == X86ISD::RDTSCP_DAG) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
    // the ECX register. Add 'ecx' explicitly to the chain.
    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
                                     HI.getValue(2));
    // Explicitly store the content of ECX at the location passed in input
    // to the 'rdtscp' intrinsic.
    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
                         MachinePointerInfo());
  }

  if (Subtarget.is64Bit()) {
    // The EDX register is loaded with the high-order 32 bits of the MSR, and
    // the EAX register is loaded with the low-order 32 bits.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat ?
    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
      return SDValue();
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      SDValue LwpIns =
          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                         LwpIns.getValue(1));
    }
    }
    return SDValue();
  }
  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, casted to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i8),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    // gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    // scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = DataToCompress.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return just a store.
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MemIntr->getMemOperand());

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
                              MemIntr->getMemOperand(),
                              false /* truncating */, true /* compressing */);
  }
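  // The TRUNCATE_TO_MEM_* cases below lower the vpmov* store intrinsics: plain
  // VTRUNC simply drops the high bits of each element (a masked truncating
  // store), while VTRUNCS/VTRUNCUS clamp each element to the signed/unsigned
  // range of the narrower type before storing.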
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // Return just a truncating store.
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(), true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  case EXPAND_FROM_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue PassThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = Op.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
    if (X86::isZeroNode(Mask))
      return DAG.getUNDEF(VT);

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
                             true /* expanding */);
  }
  }
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind
    // codes simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op); // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const MachineFunction &MF = DAG.getMachineFunction();

  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Case("ebp", X86::EBP)
                     .Case("rbp", X86::RBP)
                     .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      unsigned FrameReg =
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}

unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}

unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after CGBR pass ran.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we will end up in a situation where we will
  // reference a virtual register that is not defined!
  if (!Subtarget.is64Bit()) {
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
  }
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                     Op.getOperand(0));
}

static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}
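// On x86-64 the trampoline written below is, in effect:
//   movabsq $<nested function>, %r11
//   movabsq $<nest value>,      %r10
//   jmpq    *%r11
// i.e. it loads the hidden 'nest' parameter into R10 (as required by
// X86CallingConv.td) and then tail-jumps to the real function through R11.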
21424 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21425 SelectionDAG &DAG) const {
21426 SDValue Root = Op.getOperand(0);
21427 SDValue Trmp = Op.getOperand(1); // trampoline
21428 SDValue FPtr = Op.getOperand(2); // nested function
21429 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21432 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21433 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21435 if (Subtarget.is64Bit()) {
21436 SDValue OutChains[6];
    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] =
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
                     /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td.
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
                     /* Alignment = */ 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            auto &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::EAX;
      break;
    }
    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                     Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
                     /* Alignment = */ 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                /* Alignment = */ 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] =
        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
                     /* Alignment = */ 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */
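  // e.g. rounding-control bits 01 (round toward -inf) give
  //   ((0x000 >> 11) | (0x400 >> 9)) + 1 = (0 | 2) + 1 = 3, and 3 & 3 == 3,
  // which is FLT_ROUNDS' "round to -inf"; likewise 00 -> 1 (to nearest),
  // 10 -> 2 (to +inf) and 11 -> 0 (to zero).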
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

  // Transform as necessary
  SDValue CWD1 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x800, DL, MVT::i16)),
                  DAG.getConstant(11, DL, MVT::i8));
  SDValue CWD2 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x400, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));

  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i16,
                  DAG.getNode(ISD::ADD, DL, MVT::i16,
                              DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                              DAG.getConstant(1, DL, MVT::i16)),
                  DAG.getConstant(3, DL, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
// Split a unary integer op into 2 half-sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Extract the Lo/Hi vectors
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}
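// For example, a v32i8 CTLZ on an AVX1-only target is split here into two
// v16i8 CTLZ nodes, each of which is lowered independently, and the halves
// are concatenated back into a v32i8 result.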
// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is512BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}
/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
/// instruction.
//
// An i8/i16 vector is implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x))), 32 - EltBits) ). If zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // Split the vector; its Lo and Hi parts will be handled in the next
  // iteration.
  if (16 < NumElems)
    return LowerVectorIntUnary(Op, DAG);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use the natively supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
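// Worked example for the lowering above: for an i8 element x = 0x10 (binary
// 00010000, ctlz == 3), zext32(x) is 0x00000010, vplzcntd counts 27 leading
// zeros, and 27 - (32 - 8) == 3.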
// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
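  // e.g. byte 0x1C: the hi nibble 0x1 looks up 3 and, being non-zero, the lo
  // result is masked away below: ctlz8(0x1C) == 3. Byte 0x05: hi nibble
  // 0x0 -> 4, lo nibble 0x5 -> 1, and since the hi nibble is zero the two are
  // added: ctlz8(0x05) == 5.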
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ;
  if (CurrVT.is512BitVector()) {
    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
  } else {
    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
  }

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
  // Merge the result back from vXi8 to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at the target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    if (CurrVT.is512BitVector()) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
    } else {
      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    }
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
    CurrVT = NextVT;
  }

  return Res;
}
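// Worked example for one doubling step above, merging two i8 counts into an
// i16 count: for an i16 element 0x0013 the upper byte is zero (count 8) and
// the lower byte 0x13 has 3 leading zeros, so the step selects
// 8 + 3 == 11 == ctlz16(0x0013).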
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI())
    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
    SDValue Ops[] = {
      Op,
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Op.getValue(1)
    };
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
                   DAG.getConstant(NumBits - 1, dl, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}
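// In the lowering above, BSR returns the index of the highest set bit, so for
// i32 x = 0x00010000 it yields 16 and 16 ^ 31 == 15 == ctlz(x); for in-range
// results the xor with NumBits-1 is equivalent to (NumBits - 1) - BSR.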
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);

  if (VT.isVector()) {
    SDValue N0 = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, VT);

    // lsb(x) = (x & -x)
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));

    // cttz_undef(x) = (width - 1) - ctlz(lsb)
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
    }

    // cttz(x) = ctpop(lsb - 1)
    SDValue One = DAG.getConstant(1, dl, VT);
    return DAG.getNode(ISD::CTPOP, dl, VT,
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
  }

  assert(Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
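// The vector identities above at work: for x = 0b01100, lsb(x) = x & -x =
// 0b00100, and ctpop(lsb - 1) = ctpop(0b00011) == 2 == cttz(x).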
/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is512BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
                       Op.getOperand(0), Op.getOperand(1));
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}
static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
    // Since X86 does not have CMOV for 8-bit integer, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDLoc DL(Op);
    SDValue N0 = Op.getOperand(0);
    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                              DAG.getConstant(0, DL, VT), N0);
    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
                     SDValue(Neg.getNode(), 1)};
    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
  }

  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntUnary(Op, DAG);
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    if (Subtarget.hasInt256()) {
      // For 512-bit vectors, split into 256-bit vectors to allow the
      // sign-extension to occur.
      if (VT == MVT::v64i8)
        return Lower512IntArith(Op, DAG);

      // For 256-bit vectors, split into 128-bit vectors to allow the
      // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
        return Lower256IntArith(Op, DAG);

      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
    }

    assert(VT == MVT::v16i8 &&
           "Pre-AVX2 support only supports v16i8 multiplication");
    MVT ExVT = MVT::v8i16;

    // Extract the lo parts and sign extend to i16.
    SDValue ALo, BLo;
    if (Subtarget.hasSSE41()) {
      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
    } else {
      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                              -1, 4, -1, 5, -1, 6, -1, 7};
      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      ALo = DAG.getBitcast(ExVT, ALo);
      BLo = DAG.getBitcast(ExVT, BLo);
      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
    }

    // Extract the hi parts and sign extend to i16.
    SDValue AHi, BHi;
    if (Subtarget.hasSSE41()) {
      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                              -1, -1, -1, -1, -1, -1, -1, -1};
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
    } else {
      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                              -1, 12, -1, 13, -1, 14, -1, 15};
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      AHi = DAG.getBitcast(ExVT, AHi);
      BHi = DAG.getBitcast(ExVT, BHi);
      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
    }

    // Multiply, mask the lower 8 bits of the lo/hi results and pack.
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }
  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmuldq is available!");

    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);

    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }
  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");

  // 32-bit vector types used for MULDQ/MULUDQ.
  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

  // MULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32-bits. We can lower with this if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
      DAG.ComputeNumSignBits(B) > 32) {
    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
                       DAG.getBitcast(MulVT, B));
  }

  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
  //  return AloBlo + Hi;
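  //
  // This is the usual schoolbook expansion: with a = Ahi*2^32 + Alo and
  // b = Bhi*2^32 + Blo, a*b mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32);
  // the Ahi*Bhi term is shifted entirely out of the low 64 bits.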
  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

  // Bit cast to 32-bit vectors for MULUDQ.
  SDValue Alo = DAG.getBitcast(MulVT, A);
  SDValue Blo = DAG.getBitcast(MulVT, B);

  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    Bhi = DAG.getBitcast(MulVT, Bhi);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    Ahi = DAG.getBitcast(MulVT, Ahi);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
  }

  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  // Only i8 vectors should need custom lowering after this.
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
         "Unsupported vector type");

  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
  // logical shift down the upper half and pack back to i8.
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before multiply.
  unsigned Opcode = Op.getOpcode();
  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
  unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);

  // For 512-bit vectors, split into 256-bit vectors to allow the
  // sign-extension to occur.
  if (VT == MVT::v64i8)
    return Lower512IntArith(Op, DAG);

  // AVX2 implementations - extend xmm subvectors to ymm.
  if (Subtarget.hasInt256()) {
    unsigned NumElems = VT.getVectorNumElements();
    SDValue Lo = DAG.getIntPtrConstant(0, dl);
    SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);

    if (VT == MVT::v32i8) {
      if (Subtarget.hasBWI()) {
        SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
        SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
        SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
        Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
                          DAG.getConstant(8, dl, MVT::v32i16));
        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
      }
      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
      SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
      SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
      ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
      BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
      AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
      BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
                       DAG.getConstant(8, dl, MVT::v16i16));
      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
                       DAG.getConstant(8, dl, MVT::v16i16));
      // The ymm variant of PACKUS treats the 128-bit lanes separately, so
      // before using PACKUS we need to permute the inputs to the correct
      // lo/hi xmm lane.
      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
                            16, 17, 18, 19, 20, 21, 22, 23};
      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            24, 25, 26, 27, 28, 29, 30, 31};
      return DAG.getNode(X86ISD::PACKUS, dl, VT,
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
    }

    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
    Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
                      DAG.getConstant(8, dl, MVT::v16i16));
    // If we have BWI we can use the truncate instruction.
    if (Subtarget.hasBWI())
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }
  assert(VT == MVT::v16i8 &&
         "Pre-AVX2 support only supports v16i8 multiplication");
  MVT ExVT = MVT::v8i16;
  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

  // Extract the lo parts and zero/sign extend to i16.
  SDValue ALo, BLo;
  if (Subtarget.hasSSE41()) {
    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
  } else {
    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                            -1, 4, -1, 5, -1, 6, -1, 7};
    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);
    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
  }

  // Extract the hi parts and zero/sign extend to i16.
  SDValue AHi, BHi;
  if (Subtarget.hasSSE41()) {
    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            -1, -1, -1, -1, -1, -1, -1, -1};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
  } else {
    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                            -1, 12, -1, 13, -1, 14, -1, 15};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);
    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  }

  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
  // and pack back to v16i8.
  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
                                             SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case ISD::SDIV:    isSigned = true;  LC = RTLIB::SDIV_I128;    break;
  case ISD::UDIV:    isSigned = false; LC = RTLIB::UDIV_I128;    break;
  case ISD::SREM:    isSigned = true;  LC = RTLIB::SREM_I128;    break;
  case ISD::UREM:    isSigned = false; LC = RTLIB::UREM_I128;    break;
  case ISD::SDIVREM: isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  }

  SDLoc dl(Op);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    Entry.Node = StackPtr;
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
                           MachinePointerInfo(), /* Alignment = */ 16);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setLibCallee(
          getLibcallCallingConv(LC),
          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
          std::move(Args))
      .setInRegister()
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
}
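// Note on the lowering above: Win64 has no native 128-bit integer arguments,
// so each i128 operand is spilled to a 16-byte-aligned stack slot and passed
// to the libcall by pointer, while the i128 result is modeled as a v2i64
// register return - hence the final bitcast back to the integer type.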
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  MVT VT = Op0.getSimpleValueType();
  SDLoc dl(Op);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned Opcode = Op.getOpcode();
    unsigned NumElems = VT.getVectorNumElements();
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT),
                             Lo0, Lo1);
    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT),
                             Hi0, Hi1);
    SDValue Ops[] = {
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
    };
    return DAG.getMergeValues(Ops, dl);
  }
  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
         (VT == MVT::v16i32 && Subtarget.hasAVX512()));

  int NumElts = VT.getVectorNumElements();

  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
  // Place the odd value at an even position (basically, shift all values 1
  // step to the left):
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
                                      makeArrayRef(&Mask[0], NumElts));
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
                                      makeArrayRef(&Mask[0], NumElts));
  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opcode =
      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
  // => <2 x i64> <bf|dh>
  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

  // Shuffle it back into the right order.
  SmallVector<int, 16> HighMask(NumElts);
  SmallVector<int, 16> LowMask(NumElts);
  for (int i = 0; i != NumElts; ++i) {
    HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
    LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
  }

  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
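  // With a' and b' the operands reinterpreted as unsigned,
  //   mulhu(a', b') == mulhs(a, b) + (a < 0 ? b : 0) + (b < 0 ? a : 0)
  // (mod 2^32 per element), so the signed high parts are recovered below by
  // subtracting T1 = (a >> 31) & b and T2 = (b >> 31) & a.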
  if (IsSigned && !Subtarget.hasSSE41()) {
    SDValue ShAmt = DAG.getConstant(
        31, dl,
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }

  // The first result of MUL_LOHI is actually the low value, followed by the
  // high value.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
}
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget.
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
                                        unsigned Opcode) {
  if (VT.getScalarSizeInBits() < 16)
    return false;

  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
    return true;

  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
                (VT.is256BitVector() && Subtarget.hasInt256());

  bool AShift = LShift && (Subtarget.hasAVX512() ||
                           (VT != MVT::v2i64 && VT != MVT::v4i64));
  return (Opcode == ISD::SRA) ? AShift : LShift;
}

// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
                                      unsigned Opcode) {
  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget.
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
                                    unsigned Opcode) {
  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
    return false;

  // vXi16 supported only on AVX-512, BWI.
  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
    return false;

  if (Subtarget.hasAVX512())
    return true;

  bool LShift = VT.is128BitVector() || VT.is256BitVector();
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
  return (Opcode == ISD::SRA) ? AShift : LShift;
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);

    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
             "Unsupported PCMPGT op");
      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
                         getZeroVector(VT, Subtarget, DAG, dl), R);
    }
    if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
      SDValue Upper =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt - 32, DAG);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // SRA upper i32, SRL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt, DAG);
      SDValue Lower =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
      Lower = DAG.getBitcast(ExVT, Lower);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };
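  // e.g. an i64 ashr by 33 is built by the lambda above from 32-bit ops: the
  // low dword of each lane becomes (hi32 >> 1) and the high dword becomes the
  // sign splat (hi32 >> 31); the {5, 1, 7, 3} style shuffles stitch the two
  // partial results back into i64 lanes.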
  // Optimize shl/srl/sra with constant shift amount.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
      uint64_t ShiftAmt = ShiftConst->getZExtValue();

      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

      // i64 SRA needs to be performed as partial shifts.
      if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
           (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
          Op.getOpcode() == ISD::SRA)
        return ArithmeticShiftRight64(ShiftAmt);

      if (VT == MVT::v16i8 ||
          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
          VT == MVT::v64i8) {
        unsigned NumElts = VT.getVectorNumElements();
        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

        // Simple i8 add case
        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
          return DAG.getNode(ISD::ADD, dl, VT, R, R);

        // ashr(R, 7) === cmp_slt(R, 0)
        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
          if (VT.is512BitVector()) {
            assert(VT == MVT::v64i8 && "Unexpected element type!");
            SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
          }
          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
        }

        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
        if (VT == MVT::v16i8 && Subtarget.hasXOP())
          return SDValue();

        if (Op.getOpcode() == ISD::SHL) {
          // Make a large shift.
          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                   R, ShiftAmt, DAG);
          SHL = DAG.getBitcast(VT, SHL);
          // Zero out the rightmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SHL,
                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
        }
        if (Op.getOpcode() == ISD::SRL) {
          // Make a large shift.
          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
                                                   R, ShiftAmt, DAG);
          SRL = DAG.getBitcast(VT, SRL);
          // Zero out the leftmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SRL,
                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
        }
        if (Op.getOpcode() == ISD::SRA) {
          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
          return Res;
        }
        llvm_unreachable("Unknown shift opcode.");
      }
    }
  }
  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
  if (!Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {

    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
    unsigned SubVectorScale = 1;
    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      SubVectorScale =
          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
      Amt = Amt.getOperand(0);
    }

    // Peek through any splat that was introduced for i64 shift vectorization.
    int SplatIndex = -1;
    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
      if (SVN->isSplat()) {
        SplatIndex = SVN->getSplatIndex();
        Amt = Amt.getOperand(0);
        assert(SplatIndex < (int)VT.getVectorNumElements() &&
               "Splat shuffle referencing second operand");
      }

    if (Amt.getOpcode() != ISD::BITCAST ||
        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
      return SDValue();

    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     (SubVectorScale * VT.getVectorNumElements());
    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
    uint64_t ShiftAmt = 0;
    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
    for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
      if (!C)
        return SDValue();
      // 6 == Log2(64)
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    }

    // Check remaining shift amounts (if not a splat).
    if (SplatIndex < 0) {
      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
        uint64_t ShAmt = 0;
        for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
          if (!C)
            return SDValue();
          // 6 == Log2(64)
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
        }
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
    }

    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

    if (Op.getOpcode() == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);
  }

  return SDValue();
}
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;

  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
    SDValue BaseShAmt;
    MVT EltVT = VT.getVectorElementType();

    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
      // Check if this build_vector node is doing a splat.
      // If so, then set BaseShAmt equal to the splat value.
      BaseShAmt = BV->getSplatValue();
      if (BaseShAmt && BaseShAmt.isUndef())
        BaseShAmt = SDValue();
    } else {
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
        Amt = Amt.getOperand(0);

      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
      if (SVN && SVN->isSplat()) {
        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
        SDValue InVec = Amt.getOperand(0);
        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
                 "Unexpected shuffle index found!");
          BaseShAmt = InVec.getOperand(SplatIdx);
        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
          if (ConstantSDNode *C =
                  dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
            if (C->getZExtValue() == SplatIdx)
              BaseShAmt = InVec.getOperand(1);
          }
        }

        if (!BaseShAmt)
          // Avoid introducing an extract element from a shuffle.
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
                                  DAG.getIntPtrConstant(SplatIdx, dl));
      }
    }

    if (BaseShAmt.getNode()) {
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
    }
  }

  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     VT.getVectorNumElements();
    std::vector<SDValue> Vals(Ratio);
    for (unsigned i = 0; i != Ratio; ++i)
      Vals[i] = Amt.getOperand(i);
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }

    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }
  return SDValue();
}
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

  assert(VT.isVector() && "Custom lowering only for vector shifts!");
  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
    return V;

  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
    return V;

  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
    return Op;

  // XOP has 128-bit variable logical/arithmetic shifts.
  // +ve/-ve Amt = shift left/right.
  if (Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
       VT == MVT::v8i16 || VT == MVT::v16i8)) {
    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
      SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
    }
    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
    if (Op.getOpcode() == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }

  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
    // Splat the shift amounts so the scalar shifts above will catch it.
    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }
  // i64 vector arithmetic shift can be emulated with the transform:
  // M = lshr(SIGN_MASK, Amt)
  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
      Op.getOpcode() == ISD::SRA) {
    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }
  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  // Do this only if the vector shift count is a constant build_vector.
  if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
       (Subtarget.hasInt256() && VT == MVT::v16i16))) {
    SmallVector<SDValue, 8> Elts;
    MVT SVT = VT.getVectorElementType();
    unsigned SVTBits = SVT.getSizeInBits();
    APInt One(SVTBits, 1);
    unsigned NumElems = VT.getVectorNumElements();

    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Op = Amt->getOperand(i);
      if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }

      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }
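  // e.g. (shl v4i32 R, <1, 2, 3, 4>) becomes (mul R, <2, 4, 8, 16>), which
  // can lower to a single vector multiply (PMULLW/PMULLD) instead of
  // per-element shifts.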
  // Lower SHL with variable shift amount.
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
                     DAG.getConstant(0x3f800000U, dl, VT));
    Op = DAG.getBitcast(MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
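  // The trick above builds the float 2^Amt directly: (Amt << 23) places the
  // shift amount in the IEEE-754 exponent field and adding 0x3f800000 (the
  // bits of 1.0f) biases it, so e.g. Amt == 3 yields the bits of 8.0f, which
  // converts back to the integer 8 and turns the shift into a multiply.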
22868 // If possible, lower this shift as a sequence of two shifts by
22869 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22871 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22873 // Could be rewritten as:
22874 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22876 // The advantage is that the two shifts from the example would be
22877 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22878 // the vector shift into four scalar shifts plus four pairs of vector
22880 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22881 bool UseMOVSD = false;
22882 bool CanBeSimplified;
22883 // The splat value for the first packed shift (the 'X' from the example).
22884 SDValue Amt1 = Amt->getOperand(0);
22885 // The splat value for the second packed shift (the 'Y' from the example).
22886 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22888 // See if it is possible to replace this node with a sequence of
22889 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22890 if (VT == MVT::v4i32) {
22891 // Check if it is legal to use a MOVSS.
22892 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22893 Amt2 == Amt->getOperand(3);
22894 if (!CanBeSimplified) {
22895 // Otherwise, check if we can still simplify this node using a MOVSD.
22896 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22897 Amt->getOperand(2) == Amt->getOperand(3);
22899 Amt2 = Amt->getOperand(2);
22902 // Do similar checks for the case where the machine value type
22904 CanBeSimplified = Amt1 == Amt->getOperand(1);
22905 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22906 CanBeSimplified = Amt2 == Amt->getOperand(i);
22908 if (!CanBeSimplified) {
22910 CanBeSimplified = true;
22911 Amt2 = Amt->getOperand(4);
22912 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22913 CanBeSimplified = Amt1 == Amt->getOperand(i);
22914 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22915 CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }

    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
        isa<ConstantSDNode>(Amt2)) {
      // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
      SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
      SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
      if (UseMOVSD)
        return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
                                                       BitCast2, {0, 1, 6, 7}));
      return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
                                                     BitCast2, {0, 5, 6, 7}));
    }
  }
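  // In the shuffles above, mask indices 4-7 select lanes from the second
  // operand, so {0, 1, 6, 7} keeps the two low lanes of Shift1 and the two
  // high lanes of Shift2 (a MOVSD pattern), while {0, 5, 6, 7} keeps only
  // lane 0 of Shift1 (a MOVSS pattern).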
  // v4i32 Non-Uniform Shifts.
  // If the shift amount is constant we can shift each lane using the SSE2
  // immediate shifts, else we need to zero-extend each lane to the lower i64
  // and shift using the SSE2 variable shifts.
  // The separate results can then be blended together.
  if (VT == MVT::v4i32) {
    unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default:
        llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL:
        Opc = X86ISD::VSHL;
        break;
      case ISD::SRL:
        Opc = X86ISD::VSRL;
        break;
      case ISD::SRA:
        Opc = X86ISD::VSRA;
        break;
      }

      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. These shuffle masks
      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }
    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
  }
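  // The shuffles above reassemble the four independently shifted vectors:
  // R02 holds lanes 0 and 2 (taken from R0 and R2), R13 holds lanes 1 and 3
  // (taken from R1 and R3), and the final shuffle interleaves them back into
  // result order <R0[0], R1[1], R2[2], R3[3]>.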
  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
  // make the existing SSE solution better.
  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
      (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
      (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
      (Subtarget.hasBWI() && VT == MVT::v32i8)) {
    MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
  }
  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
    unsigned ShiftOpcode = Op->getOpcode();

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (VT.is512BitVector()) {
        // On AVX512BW targets we make use of the fact that VSELECT lowers
        // to a masked blend which selects bytes based just on the sign bit
        // extracted to a mask.
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      } else if (Subtarget.hasSSE41()) {
        // On SSE41 targets we make use of the fact that VSELECT lowers
        // to PBLENDVB which selects bytes based just on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getSelect(dl, SelVT, C, V0, V1);
    };
    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
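    // Shifting the amount left by 5 moves bit 2 of the 3-bit shift amount
    // into the sign bit of each byte, which is all the sign-bit selects
    // below look at; each "a += a" step then advances the next lower bit of
    // the amount into the sign position, giving a shift-by-4/2/1 select
    // ladder.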
    Amt = DAG.getBitcast(VT, Amt);

    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M =
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }
    if (Op->getOpcode() == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of a i16 vector
      // so we can correctly sign extend. We don't care what happens to the
      // lower byte.
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);
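      // Because each byte was unpacked into the *high* byte of an i16 lane,
      // an i16 arithmetic shift naturally shifts in copies of that byte's
      // sign bit; whatever lands in the low byte is discarded at the end.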
      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                                DAG.getConstant(4, dl, ExtVT));
      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                                DAG.getConstant(4, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(2, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(2, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(1, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(1, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero
      // upper byte, meaning that we can safely pack with PACKUSWB.
      RLo =
          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
      RHi =
          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }
  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
    MVT ExtVT = MVT::v8i32;
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }
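  // In the block above, interleaving the amounts with zeros zero-extends each
  // i16 shift amount into an i32 lane, while R is placed in the *high* half
  // of its i32 lane so that an arithmetic shift pulls in R's sign bits. The
  // SRL by 16 then returns each result to the low half with a zeroed upper
  // half, ready for PACKUS.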
  if (VT == MVT::v8i16) {
    unsigned ShiftOpcode = Op->getOpcode();

    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to
    // PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
      return DAG.getSelect(dl, VT, C, V0, V1);
    };
    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
    } else {
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
    }
    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
    R = SignBitSelect(Amt, M, R);
    return R;
  }
  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  return SDValue();
}
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();

  if (Subtarget.hasAVX512()) {
    // Attempt to rotate by immediate.
    APInt UndefElts;
    SmallVector<APInt, 16> EltBits;
    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
      if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
            return EltBits[0] == V;
          })) {
        unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
        return DAG.getNode(Op, DL, VT, R,
                           DAG.getConstant(RotateAmt, DL, MVT::i8));
      }
    }

    // Else, fall-back on VPROLV/VPRORV.
    return Op;
  }
  assert(VT.isVector() && "Custom lowering only for vector rotates!");
  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.

  // Split 256-bit integers.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);
  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

  // Attempt to rotate by immediate.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
    }
  }

  // Use general rotate by variable (per-element).
  return Op;
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  X86::CondCode Cond;
  SDLoc DL(Op);

  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);

    if (N->getValueType(1) == MVT::i1)
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }
  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);

  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
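// For example, "(i32, i1) = uaddo(a, b)" becomes an X86ISD::ADD producing
// {i32 sum, i32 EFLAGS} followed by a SETB on the flags value, and the two
// results are recombined with MERGE_VALUES.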
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();

  return false;
}
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  auto PTy = cast<PointerType>(LI->getPointerOperandType());
  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
                                               : AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
                            : AtomicExpansionKind::None;
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86.
    // We must use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
      AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SSID);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}
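// Roughly, for an idempotent RMW such as "atomicrmw or i32* %p, i32 0 acquire"
// the transformation above emits a call to llvm.x86.sse2.mfence followed by
// an atomic load from %p, replaces all uses of the atomicrmw with the load,
// and deletes the original instruction.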
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceSSID == SyncScope::System) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
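    // Without mfence, fall back to a LOCK-prefixed "or dword ptr [esp], 0":
    // any LOCK-prefixed memory operation acts as a full barrier on x86, and
    // or-ing zero into the top of the stack leaves memory unchanged.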
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32),     // Base
      DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),            // Index
      DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),            // Segment.
      Zero,
      Chain
    };
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
      SrcVT == MVT::i64) {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
      NumElts = SrcVT.getVectorNumElements();
      SVT = SrcVT.getVectorElementType();

      // Widen the input vector in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
             "Unexpected source type in LowerBITCAST");
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(0, dl)));
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2;
      SVT = MVT::i32;
    }
    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
  if (EltVT == MVT::i64) {
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }
  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }
  // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Unknown how to handle type");

  // To obtain pop count for each i16 element starting from the pop count for
  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
  // right by 8. It is important to shift as i16s as i8 vector shift isn't
  // directly supported.
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
                  DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
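// For example, take one i16 lane whose two per-byte counts are H (high byte)
// and L (low byte), i.e. the lane value is H*256 + L. The i16 SHL by 8 yields
// L*256, the byte-wise ADD makes the high byte L+H, and the i16 SRL by 8
// leaves the total L+H in the low byte with a zero high byte.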
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned VecSize = VT.getSizeInBits();

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // The general idea is that every lower byte nibble in the input vector is
  // an index into an in-register pre-computed pop count table. We then split
  // up the input vector into two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the
  // lower nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for the input byte.
  //
  // To obtain the pop count for elements != i8, we follow up with the same
  // approach and use additional tricks as described below.
  //
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
  int NumByteElts = VecSize / 8;
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
  SDValue In = DAG.getBitcast(ByteVecVT, Op);
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumByteElts; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

  // Low nibbles
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

  // The input vector is used as the shuffle mask that indexes elements into
  // the LUT. After counting low and high nibbles, add the vector to obtain
  // the final pop count per i8 element.
  SDValue HighPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
  SDValue LowPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}
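// Worked example for one byte: popcount(0xE3) = popcount(0b11100011) = 5.
// The high nibble 0xE indexes LUT[14] = 3 and the low nibble 0x3 indexes
// LUT[3] = 2 via PSHUFB, and the ADD combines them into 5.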
static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitmath lowering supported.");

  int VecSize = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  int Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for all integer vector types. We only use
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
  // much faster, even faster than using native popcnt instructions.
  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
    MVT VT = V.getSimpleValueType();
    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
  };

  auto GetMask = [&](SDValue V, APInt Mask) {
    MVT VT = V.getSimpleValueType();
    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
  };
  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
  // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyway
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
  V = DAG.getNode(ISD::SUB, DL, VT, V, And);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
  // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // count.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDLoc DL(Op.getNode());
  SDValue Op0 = Op.getOperand(0);

  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
  if (Subtarget.hasVPOPCNTDQ()) {
    unsigned NumElems = VT.getVectorNumElements();
    assert((VT.getVectorElementType() == MVT::i8 ||
            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
    if (NumElems <= 16) {
      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
    }
  }

  if (!Subtarget.hasSSSE3()) {
    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }
23861 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23863 // Decompose 256-bit ops into smaller 128-bit ops.
23864 if (VT.is256BitVector())
23865 return Lower256IntUnary(Op, DAG);
23867 assert(VT.is128BitVector() &&
23868 "Only 128-bit vector bitreverse lowering supported.");
23870 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23871 // perform the BSWAP in the shuffle.
23872 // Its best to shuffle using the second operand as this will implicitly allow
23873 // memory folding for multiple vectors.
23874 SmallVector<SDValue, 16> MaskElts;
23875 for (int i = 0; i != NumElts; ++i) {
23876 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23877 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23878 int PermuteByte = SourceByte | (2 << 5);
23879 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23883 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23884 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23885 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23887 return DAG.getBitcast(VT, Res);
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasXOP() && !VT.is512BitVector())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
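  // Worked example for one byte: reversing 0x2C (0b00101100) should give
  // 0x34 (0b00110100). The low nibble 0xC maps to LoLUT[0xC] = 0x30 (its
  // bit-reverse placed in the high nibble) and the high nibble 0x2 maps to
  // HiLUT[0x2] = 0x04 (its bit-reverse in the low nibble); OR-ing them
  // yields 0x34.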
  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool AllowIncDec = true) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    // Convert to inc/dec if they aren't slow or we are optimizing for size.
    if (AllowIncDec && (!Subtarget.slowIncDec() ||
                        DAG.getMachineFunction().getFunction().optForSize())) {
      if ((NewOpc == X86ISD::LADD && C->isOne()) ||
          (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
        return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
                                       DAG.getVTList(MVT::i32, MVT::Other),
                                       {N->getOperand(0), N->getOperand(1)},
                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
      if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
          (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
        return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
                                       DAG.getVTList(MVT::i32, MVT::Other),
                                       {N->getOperand(0), N->getOperand(1)},
                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
    }
  }

  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}
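// For example, an "atomicrmw add i32* %p, i32 1" whose result is unused
// reaches here as ATOMIC_LOAD_ADD with a constant-1 operand and becomes an
// X86ISD::LINC node, i.e. a "lock inc dword ptr [mem]" instruction (or a
// "lock add" when inc/dec are slow on the target).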
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused. They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
  if (N->hasAnyUseOfValue(0)) {
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    if (Opc == ISD::ATOMIC_LOAD_SUB) {
      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                           RHS, AN->getMemOperand());
    }
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
  // RAUW the chain, but don't worry about the result, as it's unused.
  assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  // (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  if (cast<AtomicSDNode>(Node)->getOrdering() ==
          AtomicOrdering::SequentiallyConsistent ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
                                 Node->getOperand(0),
                                 Node->getOperand(1), Node->getOperand(2),
                                 cast<AtomicSDNode>(Node)->getMemOperand());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(Op);

  // Set the carry flag.
  SDValue Carry = Op.getOperand(2);
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));
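  // Adding an all-ones constant to Carry produces a carry-out in EFLAGS.CF
  // exactly when Carry is nonzero, which rematerializes CF from the
  // materialized carry value so the ADC/SBB below can consume it.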
  unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
  SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
                            Op.getOperand(1), Carry.getValue(1));

  SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
                      : (Type *)VectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:63 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
         "input and widen element type must match");

  unsigned InNumElts = InVT.getVectorNumElements();
  unsigned WidenNumElts = NVT.getVectorNumElements();
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
         "Unexpected request for vector widening");

  SDLoc dl(InOp);
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
      InOp.getNumOperands() == 2) {
    SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }
  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned i = 0; i < InNumElts; ++i)
      Ops.push_back(InOp.getOperand(i));

    EVT EltVT = InOp.getOperand(0).getValueType();

    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
        DAG.getUNDEF(EltVT);
    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
      Ops.push_back(FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
      DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
                     InOp, DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();
  MVT MemVT = N->getMemoryVT().getSimpleVT();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
    // The v2i32 value was promoted to v2i64.
    // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // with a shuffle.
    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
           "Unexpected memory type");
    int ShuffleMask[] = {0, 2, -1, -1};
    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
    // Now we have 4 elements instead of 2.
    // Expand the index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
    Index = ExtendToType(Index, NewIndexVT, DAG);

    // Expand the mask with zeroes.
    // Mask may be <2 x i64> or <2 x i1> at this moment.
    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
           "Unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    VT = MVT::v4i32;
  }

  unsigned NumElts = VT.getVectorNumElements();
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors - either the data or the index
    // must be 512 bits wide. If both index and data are 256-bit but the
    // index vector has 8 elements, we can simply sign-extend the index.
    if (IndexVT == MVT::v8i32)
      // Just extend the index.
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimal number of elts in scatter is 8.
      NumElts = 8;
      // Index
      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
      // Use the original index here, do not modify the index twice.
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
      if (IndexVT.getScalarType() == MVT::i32)
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

      // Mask
      // At this point we have a promoted mask operand.
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
      // Use the original mask here, do not modify the mask twice.
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

      // The value that should be stored.
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }
  // If the mask is "wide" at this point - truncate it to i1 vector.
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

  // The mask is killed by scatter, add it to the values.
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
  SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
      VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {

  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
  // VLX. These types for exp-loads are handled here.
  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  SDValue Src0 = N->getSrc0();
  Src0 = ExtendToType(Src0, WideDataVT, DAG);

  // Mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, Src0,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType(),
                                      N->isExpandingLoad());

  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
  // VLX.
  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // Mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}
24378 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
24379 SelectionDAG &DAG) {
24380 assert(Subtarget.hasAVX2() &&
24381 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
24383 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
24384 SDLoc dl(Op);
24385 MVT VT = Op.getSimpleValueType();
24386 SDValue Index = N->getIndex();
24387 SDValue Mask = N->getMask();
24388 SDValue Src0 = N->getValue();
24389 MVT IndexVT = Index.getSimpleValueType();
24390 MVT MaskVT = Mask.getSimpleValueType();
24392 unsigned NumElts = VT.getVectorNumElements();
24393 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
24395 // If the index is v2i32, we're being called by type legalization.
24396 if (IndexVT == MVT::v2i32)
24397 return SDValue();
24399 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24400 !Index.getSimpleValueType().is512BitVector()) {
24401 // AVX512F supports only 512-bit vectors: either the data or the index
24402 // must be 512 bits wide. If both the index and the data are 256-bit but
24403 // the vector contains 8 elements, we just sign-extend the index.
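// Editor's illustrative sketch: for a v8f32 gather with a v8i32 index on
// AVX512F (no VLX), neither operand is 512 bits wide, so the index is
// sign-extended to v8i64; the rebuilt gather
//   v8f32 = gather v8i64 %index, v8i1 %mask
// then carries a legal 512-bit index operand.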
24404 if (NumElts == 8) {
24405 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24406 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24407 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24408 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24409 N->getMemOperand());
24410 return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
24411 }
24413 // Minimal number of elements in Gather
24414 NumElts = 8;
24415 // Index vector size should be equal to the number of elements being gathered.
24416 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
24417 Index = ExtendToType(Index, NewIndexVT, DAG);
24418 if (IndexVT.getScalarType() == MVT::i32)
24419 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24422 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
24423 // At this point the mask operand has been promoted.
24424 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
24425 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
24426 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
24427 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
24429 // The pass-through value
24430 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
24431 Src0 = ExtendToType(Src0, NewVT, DAG);
24433 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24434 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24435 DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24436 N->getMemOperand());
24437 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24438 NewGather.getValue(0),
24439 DAG.getIntPtrConstant(0, dl));
24440 SDValue RetOps[] = {Extract, NewGather.getValue(2)};
24441 return DAG.getMergeValues(RetOps, dl);
24444 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24445 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24446 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24447 N->getMemOperand());
24448 return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
24449 }
24451 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
24452 SelectionDAG &DAG) const {
24453 // TODO: Eventually, the lowering of these nodes should be informed by or
24454 // deferred to the GC strategy for the function in which they appear. For
24455 // now, however, they must be lowered to something. Since they are logically
24456 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24457 // require special handling for these nodes), lower them as literal NOOPs for
24458 // the time being.
24459 SmallVector<SDValue, 2> Ops;
24461 Ops.push_back(Op.getOperand(0));
24462 if (Op->getGluedNode())
24463 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24466 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24467 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24469 return NOOP;
24470 }
24472 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24473 SelectionDAG &DAG) const {
24474 // TODO: Eventually, the lowering of these nodes should be informed by or
24475 // deferred to the GC strategy for the function in which they appear. For
24476 // now, however, they must be lowered to something. Since they are logically
24477 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24478 // require special handling for these nodes), lower them as literal NOOPs for
24479 // the time being.
24480 SmallVector<SDValue, 2> Ops;
24482 Ops.push_back(Op.getOperand(0));
24483 if (Op->getGluedNode())
24484 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24487 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24488 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24490 return NOOP;
24491 }
24493 /// Provide custom lowering hooks for some operations.
24494 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24495 switch (Op.getOpcode()) {
24496 default: llvm_unreachable("Should not custom lower this!");
24497 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24498 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24499 return LowerCMP_SWAP(Op, Subtarget, DAG);
24500 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24501 case ISD::ATOMIC_LOAD_ADD:
24502 case ISD::ATOMIC_LOAD_SUB:
24503 case ISD::ATOMIC_LOAD_OR:
24504 case ISD::ATOMIC_LOAD_XOR:
24505 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24506 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24507 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24508 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24509 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24510 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24511 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24512 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24513 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24514 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24515 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24516 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24517 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24518 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24519 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24520 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24521 case ISD::SHL_PARTS:
24522 case ISD::SRA_PARTS:
24523 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24524 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24525 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24526 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24527 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24528 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24529 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24530 case ISD::ZERO_EXTEND_VECTOR_INREG:
24531 case ISD::SIGN_EXTEND_VECTOR_INREG:
24532 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24533 case ISD::FP_TO_SINT:
24534 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24535 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24536 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
24537 case ISD::FABS:
24538 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
24539 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24540 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24541 case ISD::SETCC: return LowerSETCC(Op, DAG);
24542 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24543 case ISD::SELECT: return LowerSELECT(Op, DAG);
24544 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24545 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24546 case ISD::VASTART: return LowerVASTART(Op, DAG);
24547 case ISD::VAARG: return LowerVAARG(Op, DAG);
24548 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24549 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
24550 case ISD::INTRINSIC_VOID:
24551 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24552 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24553 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24554 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24555 case ISD::FRAME_TO_ARGS_OFFSET:
24556 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24557 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24558 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
24559 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
24560 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
24561 case ISD::EH_SJLJ_SETUP_DISPATCH:
24562 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24563 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
24564 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
24565 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
24566 case ISD::CTLZ:
24567 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
24568 case ISD::CTTZ:
24569 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
24570 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
24571 case ISD::MULHS:
24572 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
24573 case ISD::UMUL_LOHI:
24574 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
24575 case ISD::ROTL:
24576 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
24577 case ISD::SRA:
24578 case ISD::SRL:
24579 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
24580 case ISD::SADDO:
24581 case ISD::UADDO:
24582 case ISD::SSUBO:
24583 case ISD::USUBO:
24584 case ISD::SMULO:
24585 case ISD::UMULO: return LowerXALUO(Op, DAG);
24586 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24587 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
24588 case ISD::ADDCARRY:
24589 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
24590 case ISD::ADD:
24591 case ISD::SUB: return LowerADD_SUB(Op, DAG);
24592 case ISD::SMAX:
24593 case ISD::SMIN:
24594 case ISD::UMAX:
24595 case ISD::UMIN: return LowerMINMAX(Op, DAG);
24596 case ISD::ABS: return LowerABS(Op, DAG);
24597 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
24598 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
24599 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
24600 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
24601 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
24602 case ISD::GC_TRANSITION_START:
24603 return LowerGC_TRANSITION_START(Op, DAG);
24604 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
24605 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
24606 }
24607 }
24609 /// Places new result values for the node in Results (their number
24610 /// and types must exactly match those of the original return values of
24611 /// the node), or leaves Results empty, which indicates that the node is not
24612 /// to be custom lowered after all.
24613 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24614 SmallVectorImpl<SDValue> &Results,
24615 SelectionDAG &DAG) const {
24616 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24618 if (!Res.getNode())
24619 return;
24621 assert((N->getNumValues() <= Res->getNumValues()) &&
24622 "Lowering returned the wrong number of results!");
24624 // Place new result values based on the N result number.
24625 // In some cases (LowerSINT_TO_FP for example) Res has more result values
24626 // than the original node; the chain (the last value) should be dropped.
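// Editor's illustrative sketch: if N produces one f32 result but Res was
// lowered to (f32, ch), only Res.getValue(0) is pushed below, because the
// copy loop is bounded by N->getNumValues(); the extra chain is dropped.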
24627 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24628 Results.push_back(Res.getValue(I));
24629 }
24631 /// Replace a node with an illegal result type with a new node built out of
24632 /// custom code.
24633 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24634 SmallVectorImpl<SDValue>&Results,
24635 SelectionDAG &DAG) const {
24636 SDLoc dl(N);
24637 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24638 switch (N->getOpcode()) {
24639 default:
24640 llvm_unreachable("Do not know how to custom type legalize this operation!");
24641 case X86ISD::AVG: {
24642 // Legalize types for X86ISD::AVG by expanding vectors.
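// Editor's illustrative sketch: for a v4i16 AVG, RegSize is 128, so each
// operand is widened as
//   v8i16 = concat_vectors v4i16 %op, undef
// PAVGW then runs on v8i16 and, unless widening legalization is enabled,
// the low v4i16 half is extracted back out below.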
24643 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24645 auto InVT = N->getValueType(0);
24646 auto InVTSize = InVT.getSizeInBits();
24647 const unsigned RegSize =
24648 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24649 assert((Subtarget.hasBWI() || RegSize < 512) &&
24650 "512-bit vector requires AVX512BW");
24651 assert((Subtarget.hasAVX2() || RegSize < 256) &&
24652 "256-bit vector requires AVX2");
24654 auto ElemVT = InVT.getVectorElementType();
24655 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24656 RegSize / ElemVT.getSizeInBits());
24657 assert(RegSize % InVT.getSizeInBits() == 0);
24658 unsigned NumConcat = RegSize / InVT.getSizeInBits();
24660 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24661 Ops[0] = N->getOperand(0);
24662 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24663 Ops[0] = N->getOperand(1);
24664 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24666 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24667 if (!ExperimentalVectorWideningLegalization)
24668 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24669 DAG.getIntPtrConstant(0, dl));
24670 Results.push_back(Res);
24671 return;
24672 }
24673 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24674 case X86ISD::FMINC:
24675 case X86ISD::FMIN:
24676 case X86ISD::FMAXC:
24677 case X86ISD::FMAX: {
24678 EVT VT = N->getValueType(0);
24679 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24680 SDValue UNDEF = DAG.getUNDEF(VT);
24681 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24682 N->getOperand(0), UNDEF);
24683 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24684 N->getOperand(1), UNDEF);
24685 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24686 return;
24687 }
24688 case ISD::SDIV:
24689 case ISD::UDIV:
24690 case ISD::SREM:
24691 case ISD::UREM:
24692 case ISD::SDIVREM:
24693 case ISD::UDIVREM: {
24694 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24695 Results.push_back(V);
24696 return;
24697 }
24698 case ISD::FP_TO_SINT:
24699 case ISD::FP_TO_UINT: {
24700 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24702 if (N->getValueType(0) == MVT::v2i32) {
24703 assert((IsSigned || Subtarget.hasAVX512()) &&
24704 "Can only handle signed conversion without AVX512");
24705 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24706 SDValue Src = N->getOperand(0);
24707 if (Src.getValueType() == MVT::v2f64) {
24708 MVT ResVT = MVT::v4i32;
24709 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
24710 if (!IsSigned && !Subtarget.hasVLX()) {
24711 // Widen to 512-bits.
24712 ResVT = MVT::v8i32;
24713 Opc = ISD::FP_TO_UINT;
24714 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
24715 DAG.getUNDEF(MVT::v8f64),
24716 Src, DAG.getIntPtrConstant(0, dl));
24717 }
24718 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
24719 ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
24720 : MVT::v2i32;
24721 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
24722 DAG.getIntPtrConstant(0, dl));
24723 Results.push_back(Res);
24724 return;
24725 }
24726 if (Src.getValueType() == MVT::v2f32) {
24727 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24728 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24729 DAG.getUNDEF(MVT::v2f32));
24730 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24731 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24732 if (!ExperimentalVectorWideningLegalization)
24733 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24734 Results.push_back(Res);
24735 return;
24736 }
24738 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24739 // so early out here.
24740 return;
24741 }
24743 std::pair<SDValue,SDValue> Vals =
24744 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24745 SDValue FIST = Vals.first, StackSlot = Vals.second;
24746 if (FIST.getNode()) {
24747 EVT VT = N->getValueType(0);
24748 // Return a load from the stack slot.
24749 if (StackSlot.getNode())
24750 Results.push_back(
24751 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24752 else
24753 Results.push_back(FIST);
24754 }
24755 return;
24756 }
24757 case ISD::SINT_TO_FP: {
24758 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24759 SDValue Src = N->getOperand(0);
24760 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24761 return;
24762 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24763 return;
24764 }
24765 case ISD::UINT_TO_FP: {
24766 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24767 EVT VT = N->getValueType(0);
24768 if (VT != MVT::v2f32)
24769 return;
24770 SDValue Src = N->getOperand(0);
24771 EVT SrcVT = Src.getValueType();
24772 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24773 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24774 return;
24775 }
24776 if (SrcVT != MVT::v2i32)
24777 return;
24778 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24779 SDValue VBias =
24780 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24781 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24782 DAG.getBitcast(MVT::v2i64, VBias));
24783 Or = DAG.getBitcast(MVT::v2f64, Or);
24784 // TODO: Are there any fast-math-flags to propagate here?
24785 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24786 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24787 return;
24788 }
24789 case ISD::FP_ROUND: {
24790 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24791 return;
24792 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24793 Results.push_back(V);
24794 return;
24795 }
24796 case ISD::FP_EXTEND: {
24797 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24798 // No other ValueType for FP_EXTEND should reach this point.
24799 assert(N->getValueType(0) == MVT::v2f32 &&
24800 "Do not know how to legalize this Node");
24803 case ISD::INTRINSIC_W_CHAIN: {
24804 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24805 switch (IntNo) {
24806 default : llvm_unreachable("Do not know how to custom type "
24807 "legalize this intrinsic operation!");
24808 case Intrinsic::x86_rdtsc:
24809 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24810 Results);
24811 case Intrinsic::x86_rdtscp:
24812 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24813 Results);
24814 case Intrinsic::x86_rdpmc:
24815 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24817 case Intrinsic::x86_xgetbv:
24818 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24819 }
24820 }
24821 case ISD::INTRINSIC_WO_CHAIN: {
24822 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
24823 Results.push_back(V);
24824 return;
24825 }
24826 case ISD::READCYCLECOUNTER: {
24827 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24828 Results);
24829 }
24830 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24831 EVT T = N->getValueType(0);
24832 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
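// Editor's note on the register contract materialized below (per the
// CMPXCHG8B/CMPXCHG16B ISA): the expected value goes in EDX:EAX (RDX:RAX
// for i128), the replacement value in ECX:EBX (RCX:RBX), and on completion
// the current memory value is left in EDX:EAX with ZF set on success.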
24833 bool Regs64bit = T == MVT::i128;
24834 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24835 SDValue cpInL, cpInH;
24836 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24837 DAG.getConstant(0, dl, HalfT));
24838 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24839 DAG.getConstant(1, dl, HalfT));
24840 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24841 Regs64bit ? X86::RAX : X86::EAX,
24842 cpInL, SDValue());
24843 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24844 Regs64bit ? X86::RDX : X86::EDX,
24845 cpInH, cpInL.getValue(1));
24846 SDValue swapInL, swapInH;
24847 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24848 DAG.getConstant(0, dl, HalfT));
24849 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24850 DAG.getConstant(1, dl, HalfT));
24851 swapInH =
24852 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24853 swapInH, cpInH.getValue(1));
24854 // If the current function needs the base pointer, RBX,
24855 // we shouldn't use cmpxchg directly.
24856 // The lowering of that instruction will clobber
24857 // that register, and since RBX will be a reserved register
24858 // the register allocator will not make sure its value is
24859 // properly saved and restored around this live-range.
24860 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24861 SDValue Result;
24862 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24863 unsigned BasePtr = TRI->getBaseRegister();
24864 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24865 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24866 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24867 // ISel prefers the LCMPXCHG64 variant.
24868 // If that assert breaks, that means it is not the case anymore,
24869 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24870 // not just EBX. This is a matter of accepting i64 input for that
24871 // pseudo, and restoring into the register of the right width
24872 // in the expand pseudo. Everything else should just work.
24873 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24874 "Saving only half of the RBX");
24875 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24876 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24877 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24878 Regs64bit ? X86::RBX : X86::EBX,
24879 HalfT, swapInH.getValue(1));
24880 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24881 RBXSave,
24882 /*Glue*/ RBXSave.getValue(2)};
24883 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24884 } else {
24885 unsigned Opcode =
24886 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24887 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24888 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24889 swapInH.getValue(1));
24890 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24891 swapInL.getValue(1)};
24892 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24893 }
24894 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24895 Regs64bit ? X86::RAX : X86::EAX,
24896 HalfT, Result.getValue(1));
24897 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24898 Regs64bit ? X86::RDX : X86::EDX,
24899 HalfT, cpOutL.getValue(2));
24900 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24902 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24903 MVT::i32, cpOutH.getValue(2));
24904 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24905 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24907 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24908 Results.push_back(Success);
24909 Results.push_back(EFLAGS.getValue(1));
24910 return;
24911 }
24912 case ISD::ATOMIC_SWAP:
24913 case ISD::ATOMIC_LOAD_ADD:
24914 case ISD::ATOMIC_LOAD_SUB:
24915 case ISD::ATOMIC_LOAD_AND:
24916 case ISD::ATOMIC_LOAD_OR:
24917 case ISD::ATOMIC_LOAD_XOR:
24918 case ISD::ATOMIC_LOAD_NAND:
24919 case ISD::ATOMIC_LOAD_MIN:
24920 case ISD::ATOMIC_LOAD_MAX:
24921 case ISD::ATOMIC_LOAD_UMIN:
24922 case ISD::ATOMIC_LOAD_UMAX:
24923 case ISD::ATOMIC_LOAD: {
24924 // Delegate to generic TypeLegalization. Situations we can really handle
24925 // should have already been dealt with by AtomicExpandPass.cpp.
24926 break;
24927 }
24928 case ISD::BITCAST: {
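// Editor's illustrative sketch: an f64 -> v2i32 bitcast is handled by
//   v2f64 = scalar_to_vector f64 %x      (widen the scalar)
//   v4i32 = bitcast v2f64                (legal 128-bit bitcast)
// followed by extracting the two low elements, or by returning the widened
// vector directly when widening legalization is enabled.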
24929 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24930 EVT DstVT = N->getValueType(0);
24931 EVT SrcVT = N->getOperand(0)->getValueType(0);
24933 if (SrcVT != MVT::f64 ||
24934 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24935 return;
24937 unsigned NumElts = DstVT.getVectorNumElements();
24938 EVT SVT = DstVT.getVectorElementType();
24939 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24940 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24941 MVT::v2f64, N->getOperand(0));
24942 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24944 if (ExperimentalVectorWideningLegalization) {
24945 // If we are legalizing vectors by widening, we already have the desired
24946 // legal vector type, just return it.
24947 Results.push_back(ToVecInt);
24948 return;
24949 }
24951 SmallVector<SDValue, 8> Elts;
24952 for (unsigned i = 0, e = NumElts; i != e; ++i)
24953 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24954 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24956 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24957 return;
24958 }
24959 case ISD::MGATHER: {
24960 EVT VT = N->getValueType(0);
24961 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
24962 auto *Gather = cast<MaskedGatherSDNode>(N);
24963 SDValue Index = Gather->getIndex();
24964 if (Index.getValueType() != MVT::v2i64)
24965 return;
24966 SDValue Mask = Gather->getMask();
24967 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24968 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24969 Gather->getValue(),
24970 DAG.getUNDEF(MVT::v2f32));
24971 if (!Subtarget.hasVLX()) {
24972 // We need to widen the mask, but the instruction will only use 2
24973 // of its elements. So we can use undef.
24974 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24975 DAG.getUNDEF(MVT::v2i1));
24976 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
24977 }
24978 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
24979 Index };
24980 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24981 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
24982 Gather->getMemoryVT(), Gather->getMemOperand());
24983 Results.push_back(Res);
24984 Results.push_back(Res.getValue(2));
24985 return;
24986 }
24987 if (VT == MVT::v2i32) {
24988 auto *Gather = cast<MaskedGatherSDNode>(N);
24989 SDValue Index = Gather->getIndex();
24990 SDValue Mask = Gather->getMask();
24991 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24992 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
24993 Gather->getValue(),
24994 DAG.getUNDEF(MVT::v2i32));
24995 // If the index is v2i64 we can use it directly.
24996 if (Index.getValueType() == MVT::v2i64 &&
24997 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
24998 if (!Subtarget.hasVLX()) {
24999 // We need to widen the mask, but the instruction will only use 2
25000 // of its elements. So we can use undef.
25001 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25002 DAG.getUNDEF(MVT::v2i1));
25003 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25004 }
25005 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25006 Index };
25007 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25008 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25009 Gather->getMemoryVT(), Gather->getMemOperand());
25010 SDValue Chain = Res.getValue(2);
25011 if (!ExperimentalVectorWideningLegalization)
25012 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25013 DAG.getIntPtrConstant(0, dl));
25014 Results.push_back(Res);
25015 Results.push_back(Chain);
25018 EVT IndexVT = Index.getValueType();
25019 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25020 IndexVT.getScalarType(), 4);
25021 // Otherwise we need to custom widen everything to avoid promotion.
25022 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25023 DAG.getUNDEF(IndexVT));
25024 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25025 DAG.getConstant(0, dl, MVT::v2i1));
25026 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25027 Index };
25028 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25029 Gather->getMemoryVT(), dl, Ops,
25030 Gather->getMemOperand());
25031 SDValue Chain = Res.getValue(1);
25032 if (!ExperimentalVectorWideningLegalization)
25033 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25034 DAG.getIntPtrConstant(0, dl));
25035 Results.push_back(Res);
25036 Results.push_back(Chain);
25037 return;
25038 }
25039 break;
25040 }
25041 }
25042 }
25044 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25045 switch ((X86ISD::NodeType)Opcode) {
25046 case X86ISD::FIRST_NUMBER: break;
25047 case X86ISD::BSF: return "X86ISD::BSF";
25048 case X86ISD::BSR: return "X86ISD::BSR";
25049 case X86ISD::SHLD: return "X86ISD::SHLD";
25050 case X86ISD::SHRD: return "X86ISD::SHRD";
25051 case X86ISD::FAND: return "X86ISD::FAND";
25052 case X86ISD::FANDN: return "X86ISD::FANDN";
25053 case X86ISD::FOR: return "X86ISD::FOR";
25054 case X86ISD::FXOR: return "X86ISD::FXOR";
25055 case X86ISD::FILD: return "X86ISD::FILD";
25056 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25057 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25058 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25059 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25060 case X86ISD::FLD: return "X86ISD::FLD";
25061 case X86ISD::FST: return "X86ISD::FST";
25062 case X86ISD::CALL: return "X86ISD::CALL";
25063 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25064 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25065 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25066 case X86ISD::BT: return "X86ISD::BT";
25067 case X86ISD::CMP: return "X86ISD::CMP";
25068 case X86ISD::COMI: return "X86ISD::COMI";
25069 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25070 case X86ISD::CMPM: return "X86ISD::CMPM";
25071 case X86ISD::CMPMU: return "X86ISD::CMPMU";
25072 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25073 case X86ISD::SETCC: return "X86ISD::SETCC";
25074 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25075 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25076 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25077 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25078 case X86ISD::CMOV: return "X86ISD::CMOV";
25079 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25080 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25081 case X86ISD::IRET: return "X86ISD::IRET";
25082 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25083 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25084 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25085 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25086 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25087 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25088 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25089 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25090 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25091 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25092 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25093 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25094 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25095 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25096 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25097 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25098 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25099 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25100 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25101 case X86ISD::HADD: return "X86ISD::HADD";
25102 case X86ISD::HSUB: return "X86ISD::HSUB";
25103 case X86ISD::FHADD: return "X86ISD::FHADD";
25104 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25105 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25106 case X86ISD::FMAX: return "X86ISD::FMAX";
25107 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25108 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25109 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
25110 case X86ISD::FMIN: return "X86ISD::FMIN";
25111 case X86ISD::FMINS: return "X86ISD::FMINS";
25112 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25113 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25114 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25115 case X86ISD::FMINC: return "X86ISD::FMINC";
25116 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25117 case X86ISD::FRCP: return "X86ISD::FRCP";
25118 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25119 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25120 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25121 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25122 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25123 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25124 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25125 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25126 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25127 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25128 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25129 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25130 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25131 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25132 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25133 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25134 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25135 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25136 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25137 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25138 case X86ISD::LADD: return "X86ISD::LADD";
25139 case X86ISD::LSUB: return "X86ISD::LSUB";
25140 case X86ISD::LOR: return "X86ISD::LOR";
25141 case X86ISD::LXOR: return "X86ISD::LXOR";
25142 case X86ISD::LAND: return "X86ISD::LAND";
25143 case X86ISD::LINC: return "X86ISD::LINC";
25144 case X86ISD::LDEC: return "X86ISD::LDEC";
25145 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25146 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25147 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25148 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25149 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25150 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25151 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25152 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25153 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25154 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25155 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25156 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25157 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25158 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25159 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25160 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25161 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25162 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
25163 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25164 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25165 case X86ISD::VSHL: return "X86ISD::VSHL";
25166 case X86ISD::VSRL: return "X86ISD::VSRL";
25167 case X86ISD::VSRA: return "X86ISD::VSRA";
25168 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25169 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25170 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25171 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25172 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25173 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25174 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25175 case X86ISD::CMPP: return "X86ISD::CMPP";
25176 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25177 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25178 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
25179 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
25180 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25181 case X86ISD::ADD: return "X86ISD::ADD";
25182 case X86ISD::SUB: return "X86ISD::SUB";
25183 case X86ISD::ADC: return "X86ISD::ADC";
25184 case X86ISD::SBB: return "X86ISD::SBB";
25185 case X86ISD::SMUL: return "X86ISD::SMUL";
25186 case X86ISD::UMUL: return "X86ISD::UMUL";
25187 case X86ISD::SMUL8: return "X86ISD::SMUL8";
25188 case X86ISD::UMUL8: return "X86ISD::UMUL8";
25189 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
25190 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
25191 case X86ISD::INC: return "X86ISD::INC";
25192 case X86ISD::DEC: return "X86ISD::DEC";
25193 case X86ISD::OR: return "X86ISD::OR";
25194 case X86ISD::XOR: return "X86ISD::XOR";
25195 case X86ISD::AND: return "X86ISD::AND";
25196 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
25197 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
25198 case X86ISD::PTEST: return "X86ISD::PTEST";
25199 case X86ISD::TESTP: return "X86ISD::TESTP";
25200 case X86ISD::TESTM: return "X86ISD::TESTM";
25201 case X86ISD::TESTNM: return "X86ISD::TESTNM";
25202 case X86ISD::KORTEST: return "X86ISD::KORTEST";
25203 case X86ISD::KTEST: return "X86ISD::KTEST";
25204 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
25205 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
25206 case X86ISD::PACKSS: return "X86ISD::PACKSS";
25207 case X86ISD::PACKUS: return "X86ISD::PACKUS";
25208 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
25209 case X86ISD::VALIGN: return "X86ISD::VALIGN";
25210 case X86ISD::VSHLD: return "X86ISD::VSHLD";
25211 case X86ISD::VSHRD: return "X86ISD::VSHRD";
25212 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
25213 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
25214 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
25215 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
25216 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
25217 case X86ISD::SHUFP: return "X86ISD::SHUFP";
25218 case X86ISD::SHUF128: return "X86ISD::SHUF128";
25219 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
25220 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
25221 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
25222 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
25223 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
25224 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
25225 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
25226 case X86ISD::MOVSD: return "X86ISD::MOVSD";
25227 case X86ISD::MOVSS: return "X86ISD::MOVSS";
25228 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
25229 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
25230 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
25231 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
25232 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
25233 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
25234 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
25235 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
25236 case X86ISD::VPERMV: return "X86ISD::VPERMV";
25237 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
25238 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
25239 case X86ISD::VPERMI: return "X86ISD::VPERMI";
25240 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
25241 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
25242 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
25243 case X86ISD::VRANGE: return "X86ISD::VRANGE";
25244 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
25245 case X86ISD::VRANGES: return "X86ISD::VRANGES";
25246 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
25247 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
25248 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
25249 case X86ISD::PSADBW: return "X86ISD::PSADBW";
25250 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
25251 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
25252 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
25253 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
25254 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
25255 case X86ISD::MFENCE: return "X86ISD::MFENCE";
25256 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
25257 case X86ISD::SAHF: return "X86ISD::SAHF";
25258 case X86ISD::RDRAND: return "X86ISD::RDRAND";
25259 case X86ISD::RDSEED: return "X86ISD::RDSEED";
25260 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
25261 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
25262 case X86ISD::VPSHA: return "X86ISD::VPSHA";
25263 case X86ISD::VPSHL: return "X86ISD::VPSHL";
25264 case X86ISD::VPCOM: return "X86ISD::VPCOM";
25265 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
25266 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
25267 case X86ISD::FMSUB: return "X86ISD::FMSUB";
25268 case X86ISD::FNMADD: return "X86ISD::FNMADD";
25269 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
25270 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
25271 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
25272 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
25273 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
25274 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
25275 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
25276 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
25277 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
25278 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
25279 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
25280 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
25281 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
25282 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
25283 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
25284 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
25285 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
25286 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
25287 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
25288 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
25289 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
25290 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
25291 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
25292 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
25293 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
25294 case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
25295 case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
25296 case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
25297 case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
25298 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
25299 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
25300 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
25301 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
25302 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
25303 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
25304 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
25305 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
25306 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
25307 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
25308 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
25309 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
25310 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
25311 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
25312 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
25313 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
25314 case X86ISD::XTEST: return "X86ISD::XTEST";
25315 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
25316 case X86ISD::EXPAND: return "X86ISD::EXPAND";
25317 case X86ISD::SELECT: return "X86ISD::SELECT";
25318 case X86ISD::SELECTS: return "X86ISD::SELECTS";
25319 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
25320 case X86ISD::RCP14: return "X86ISD::RCP14";
25321 case X86ISD::RCP14S: return "X86ISD::RCP14S";
25322 case X86ISD::RCP28: return "X86ISD::RCP28";
25323 case X86ISD::RCP28S: return "X86ISD::RCP28S";
25324 case X86ISD::EXP2: return "X86ISD::EXP2";
25325 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
25326 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
25327 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
25328 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
25329 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
25330 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
25331 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
25332 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
25333 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
25334 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
25335 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
25336 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
25337 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
25338 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
25339 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
25340 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
25341 case X86ISD::SCALEF: return "X86ISD::SCALEF";
25342 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
25343 case X86ISD::ADDS: return "X86ISD::ADDS";
25344 case X86ISD::SUBS: return "X86ISD::SUBS";
25345 case X86ISD::AVG: return "X86ISD::AVG";
25346 case X86ISD::MULHRS: return "X86ISD::MULHRS";
25347 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
25348 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
25349 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
25350 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
25351 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
25352 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
25353 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
25354 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
25355 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
25356 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
25357 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
25358 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
25359 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
25360 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
25361 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
25362 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
25363 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
25364 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
25365 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
25366 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
25367 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
25368 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
25369 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
25370 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
25371 case X86ISD::LWPINS: return "X86ISD::LWPINS";
25372 case X86ISD::MGATHER: return "X86ISD::MGATHER";
25373 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
25374 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
25375 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
25376 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
25377 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
25378 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
25379 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
25380 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
25381 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
25382 }
25383 return nullptr;
25384 }
25386 /// Return true if the addressing mode represented by AM is legal for this
25387 /// target, for a load/store of the specified type.
25388 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
25389 const AddrMode &AM, Type *Ty,
25390 unsigned AS,
25391 Instruction *I) const {
25392 // X86 supports extremely general addressing modes.
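// Editor's illustrative sketch of the general form accepted here:
//   BaseGV + BaseReg + Scale*IndexReg + Disp32
// e.g. "movl 16(%rax,%rcx,4), %edx" uses BaseReg=%rax, Scale=4,
// IndexReg=%rcx, Disp=16. Scales of 3, 5 and 9 are formed by also using
// the base-register slot (base + 2/4/8*index), so they are rejected below
// when a base register is already present.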
25393 CodeModel::Model M = getTargetMachine().getCodeModel();
25395 // X86 allows a sign-extended 32-bit immediate field as a displacement.
25396 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
25397 return false;
25399 if (AM.BaseGV) {
25400 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
25402 // If a reference to this global requires an extra load, we can't fold it.
25403 if (isGlobalStubReference(GVFlags))
25404 return false;
25406 // If BaseGV requires a register for the PIC base, we cannot also have a
25407 // BaseReg specified.
25408 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
25409 return false;
25411 // If lower 4G is not available, then we must use rip-relative addressing.
25412 if ((M != CodeModel::Small || isPositionIndependent()) &&
25413 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
25414 return false;
25415 }
25417 switch (AM.Scale) {
25418 case 0:
25419 case 1:
25420 case 2:
25421 case 4:
25422 case 8:
25423 // These scales always work.
25424 break;
25425 case 3:
25426 case 5:
25427 case 9:
25428 // These scales are formed with basereg+scalereg. Only accept if there is
25429 // no basereg yet.
25430 if (AM.HasBaseReg)
25431 return false;
25432 break;
25433 default: // Other stuff never works.
25434 return false;
25435 }
25437 return true;
25438 }
25440 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
25441 unsigned Bits = Ty->getScalarSizeInBits();
25443 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
25444 // particularly cheaper than those without.
25445 if (Bits == 8)
25446 return false;
25448 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
25449 // shifts just as cheap as scalar ones.
25450 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
25451 return true;
25453 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
25454 // fully general vector.
25455 return true;
25456 }
25458 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
25459 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25460 return false;
25461 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
25462 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
25463 return NumBits1 > NumBits2;
25464 }
25466 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
25467 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25468 return false;
25470 if (!isTypeLegal(EVT::getEVT(Ty1)))
25471 return false;
25473 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
25475 // Assuming the caller doesn't have a zeroext or signext return parameter,
25476 // truncation all the way down to i1 is valid.
25477 return true;
25478 }
25480 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
25481 return isInt<32>(Imm);
25482 }
25484 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
25485 // Can also use sub to handle negated immediates.
25486 return isInt<32>(Imm);
25487 }
25489 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
25490 if (!VT1.isInteger() || !VT2.isInteger())
25491 return false;
25492 unsigned NumBits1 = VT1.getSizeInBits();
25493 unsigned NumBits2 = VT2.getSizeInBits();
25494 return NumBits1 > NumBits2;
25495 }
25497 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
25498 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
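// Editor's illustrative example: "movl %ecx, %eax" already clears bits
// 63:32 of %rax, so a later i32 -> i64 zext costs no extra instruction.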
25499 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
25500 }
25502 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
25503 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25504 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
25505 }
25507 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
25508 EVT VT1 = Val.getValueType();
25509 if (isZExtFree(VT1, VT2))
25510 return true;
25512 if (Val.getOpcode() != ISD::LOAD)
25513 return false;
25515 if (!VT1.isSimple() || !VT1.isInteger() ||
25516 !VT2.isSimple() || !VT2.isInteger())
25517 return false;
25519 switch (VT1.getSimpleVT().SimpleTy) {
25520 default: break;
25521 case MVT::i8:
25522 case MVT::i16:
25523 case MVT::i32:
25524 // X86 has 8, 16, and 32-bit zero-extending loads.
25525 return true;
25526 }
25528 return false;
25529 }
25531 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
25533 bool
25534 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
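// Editor's note: on FMA-capable targets a fused multiply-add such as
// VFMADD213SS computes a*b+c in one instruction, so it is preferred over
// a separate MULSS + ADDSS pair for the types accepted below.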
25535 if (!Subtarget.hasAnyFMA())
25536 return false;
25538 VT = VT.getScalarType();
25540 if (!VT.isSimple())
25541 return false;
25543 switch (VT.getSimpleVT().SimpleTy) {
25544 case MVT::f32:
25545 case MVT::f64:
25546 return true;
25547 default:
25548 break;
25549 }
25551 return false;
25552 }
25554 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
25555 // i16 instructions are longer (0x66 prefix) and potentially slower.
25556 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
25557 }
25559 /// Targets can use this to indicate that they only support *some*
25560 /// VECTOR_SHUFFLE operations, those with specific masks.
25561 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
25562 /// are assumed to be legal.
25563 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
25564 if (!VT.isSimple())
25565 return false;
25567 // Not for i1 vectors
25568 if (VT.getSimpleVT().getScalarType() == MVT::i1)
25569 return false;
25571 // Very little shuffling can be done for 64-bit vectors right now.
25572 if (VT.getSimpleVT().getSizeInBits() == 64)
25573 return false;
25575 // We only care that the types being shuffled are legal. The lowering can
25576 // handle any possible shuffle mask that results.
25577 return isTypeLegal(VT.getSimpleVT());
25578 }
25580 bool
25581 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
25582 EVT VT) const {
25583 // Just delegate to the generic legality, clear masks aren't special.
25584 return isShuffleMaskLegal(Mask, VT);
25585 }
25587 //===----------------------------------------------------------------------===//
25588 // X86 Scheduler Hooks
25589 //===----------------------------------------------------------------------===//
25591 /// Utility function to emit xbegin specifying the start of an RTM region.
25592 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
25593 const TargetInstrInfo *TII) {
25594 DebugLoc DL = MI.getDebugLoc();
25596 const BasicBlock *BB = MBB->getBasicBlock();
25597 MachineFunction::iterator I = ++MBB->getIterator();
25599 // For the v = xbegin(), we generate
25600 //
25601 // thisMBB:
25602 //  xbegin fallMBB
25603 //
25604 // mainMBB:
25605 //  s0 = -1
25606 //
25607 // fallMBB:
25608 //  eax = # XABORT_DEF
25609 //  s1 = eax
25610 //
25611 // sinkMBB:
25612 //  v = phi(s0/mainBB, s1/fallBB)
25614 MachineBasicBlock *thisMBB = MBB;
25615 MachineFunction *MF = MBB->getParent();
25616 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25617 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25618 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25619 MF->insert(I, mainMBB);
25620 MF->insert(I, fallMBB);
25621 MF->insert(I, sinkMBB);
25623 // Transfer the remainder of BB and its successor edges to sinkMBB.
25624 sinkMBB->splice(sinkMBB->begin(), MBB,
25625 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25626 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25628 MachineRegisterInfo &MRI = MF->getRegInfo();
25629 unsigned DstReg = MI.getOperand(0).getReg();
25630 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25631 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25632 unsigned fallDstReg = MRI.createVirtualRegister(RC);
25634 // thisMBB:
25635 //  xbegin fallMBB
25636 // # fallthrough to mainMBB
25637 // # abortion to fallMBB
25638 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25639 thisMBB->addSuccessor(mainMBB);
25640 thisMBB->addSuccessor(fallMBB);
25642 // mainMBB:
25643 // mainDstReg := -1
25644 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25645 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25646 mainMBB->addSuccessor(sinkMBB);
25648 // fallMBB:
25649 // ; pseudo instruction to model hardware's definition from XABORT
25650 // EAX := XABORT_DEF
25651 // fallDstReg := EAX
25652 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25653 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25654 .addReg(X86::EAX);
25655 fallMBB->addSuccessor(sinkMBB);
25657 // sinkMBB:
25658 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25659 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25660 .addReg(mainDstReg).addMBB(mainMBB)
25661 .addReg(fallDstReg).addMBB(fallMBB);
25663 MI.eraseFromParent();
25664 return sinkMBB;
25665 }
25667 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25668 // or XMM0_V32I8 in AVX all of this code can be replaced with that
25669 // in the .td file.
25670 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25671 const TargetInstrInfo *TII) {
25672 unsigned Opc;
25673 switch (MI.getOpcode()) {
25674 default: llvm_unreachable("illegal opcode!");
25675 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
25676 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25677 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
25678 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25679 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
25680 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25681 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
25682 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25683 }
25685 DebugLoc dl = MI.getDebugLoc();
25686 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25688 unsigned NumArgs = MI.getNumOperands();
25689 for (unsigned i = 1; i < NumArgs; ++i) {
25690 MachineOperand &Op = MI.getOperand(i);
25691 if (!(Op.isReg() && Op.isImplicit()))
25692 MIB.add(Op);
25693 }
25694 if (MI.hasOneMemOperand())
25695 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25697 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25698 .addReg(X86::XMM0);
25700 MI.eraseFromParent();
25701 return BB;
25702 }
25704 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25705 // defs in an instruction pattern
25706 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25707 const TargetInstrInfo *TII) {
25708 unsigned Opc;
25709 switch (MI.getOpcode()) {
25710 default: llvm_unreachable("illegal opcode!");
25711 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
25712 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25713 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
25714 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25715 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
25716 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25717 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
25718 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25719 }
25721 DebugLoc dl = MI.getDebugLoc();
25722 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25724 unsigned NumArgs = MI.getNumOperands(); // remove the results
25725 for (unsigned i = 1; i < NumArgs; ++i) {
25726 MachineOperand &Op = MI.getOperand(i);
25727 if (!(Op.isReg() && Op.isImplicit()))
25728 MIB.add(Op);
25729 }
25730 if (MI.hasOneMemOperand())
25731 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25733 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25734 .addReg(X86::ECX);
25736 MI.eraseFromParent();
25737 return BB;
25738 }
25740 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25741 const X86Subtarget &Subtarget) {
25742 DebugLoc dl = MI.getDebugLoc();
25743 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
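// Editor's note on the WRPKRU ISA contract reproduced below: the new PKRU
// value is taken from EAX, and ECX and EDX must both be zero or the
// instruction raises #GP.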
25745 // insert input VAL into EAX
25746 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25747 .addReg(MI.getOperand(0).getReg());
25748 // insert zero to ECX
25749 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25751 // insert zero to EDX
25752 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25754 // insert WRPKRU instruction
25755 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25757 MI.eraseFromParent(); // The pseudo is gone now.
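// Note on emitWRPKRU above: WRPKRU writes the PKRU register from EAX and
// requires ECX = EDX = 0, so the expansion corresponds roughly to the
// following sequence (illustrative only; registers are assigned later):
//
//   movl  <val>, %eax
//   xorl  %ecx, %ecx
//   xorl  %edx, %edx
//   wrpkru
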
static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  DebugLoc dl = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // insert zero into ECX
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

  // insert RDPKRU instruction
  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::EAX);

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}

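// Note on emitRDPKRU above: RDPKRU requires ECX = 0 and returns the PKRU
// value in EAX (zeroing EDX), which is why only EAX is copied into the
// pseudo's result register. Roughly:
//
//   xorl  %ecx, %ecx
//   rdpkru
//   movl  %eax, <result>
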
static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
                                      const X86Subtarget &Subtarget,
                                      unsigned Opc) {
  DebugLoc dl = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
      .addReg(MI.getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
      .addReg(MI.getOperand(ValOps + 1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(Opc));

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}

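// Note on emitMonitor above: MONITOR-style instructions take the linear
// address implicitly in RAX/EAX and their two extension/hint arguments in
// ECX and EDX, so the helper materializes the address with an LEA and
// copies the two trailing pseudo operands into ECX/EDX before emitting the
// real (operand-less) opcode.
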
static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  // Address into RAX/EAX
  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI->getOperand(i));

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  static_assert(X86::AddrNumOperands == 5,
                "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI.getOperand(0).getReg();
  MachineOperand &Base = MI.getOperand(1);
  MachineOperand &Scale = MI.getOperand(2);
  MachineOperand &Index = MI.getOperand(3);
  MachineOperand &Disp = MI.getOperand(4);
  MachineOperand &Segment = MI.getOperand(5);
  unsigned ArgSize = MI.getOperand(6).getImm();
  unsigned ArgMode = MI.getOperand(7).getImm();
  unsigned Align = MI.getOperand(8).getImm();

  // Memory Reference
  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI.getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  /* Align ArgSize to a multiple of 8 */
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;   // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //       /     \
    //  offsetMBB  overflowMBB
    //       \     /
    //       endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
        .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, 16)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
        .addReg(OffsetReg64)
        .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
        .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Align - 1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~(uint64_t)(Align - 1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
      .addReg(OverflowDestReg)
      .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
        .addReg(OffsetDestReg).addMBB(offsetMBB)
        .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI.eraseFromParent();

  return endMBB;
}

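// Illustrative sketch of the fast path emitted above for a gp_offset
// argument (ArgMode == 1, 8-byte type); this is not emitted verbatim and
// registers are virtual before register allocation:
//
//   movl  gp_offset(%ap), %off
//   cmpl  $48, %off                 # 6 GP regs * 8 bytes of save area
//   jae   .Loverflow                # fall back to overflow_area
//   movq  reg_save_area(%ap), %rsave
//   addq  %rsave, %off64            # address of the argument
//   addl  $8, %off
//   movl  %off, gp_offset(%ap)      # bump the offset for the next va_arg
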
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them;
  // however, this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned CountReg = MI.getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

  if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI.getNumOperands() <= 3 ||
          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
         "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO = F->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
        .addFrameIndex(RegSaveFrameIndex)
        .addImm(/*Scale=*/1)
        .addReg(/*IndexReg=*/0)
        .addImm(/*Disp=*/Offset)
        .addReg(/*Segment=*/0)
        .addReg(MI.getOperand(i).getReg())
        .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return EndMBB;
}

// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                     MachineBasicBlock* BB,
                                     const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of EFLAGS.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
                                          sEnd = BB->succ_end();
         sItr != sEnd; ++sItr) {
      MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}

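// Example of the situation handled above (a sketch, not actual output):
//
//   CMP32rr ...            ; defines EFLAGS
//   %a = CMOV_GR32 ...     ; reads EFLAGS
//   %b = CMOV_GR32 ...     ; reads EFLAGS
//
// ISel cannot safely put a kill flag on the first CMOV's EFLAGS use, so
// when that CMOV is lowered, the forward scan sees the second reader and
// returns false rather than adding a kill flag too early.
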
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic-block with
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
  case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;

  default:
    return false;
  }
}

// Helper function, which inserts PHI functions into SinkMBB:
//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
// the last PHI function inserted.
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    MachineBasicBlock *SinkMBB) {
  MachineFunction *MF = TrueMBB->getParent();
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  DebugLoc DL = MIItBegin->getDebugLoc();

  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);

  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from each earlier PHI's
  // destination register to the registers that went into that PHI.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
  MachineInstrBuilder MIB;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If this CMOV we are generating is the opposite condition from
    // the jump we generated, then we have to swap the operands for the
    // PHI that is going to be generated.
    if (MIIt->getOperand(3).getImm() == OppCC)
      std::swap(Op1Reg, Op2Reg);

    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
      Op1Reg = RegRewriteTable[Op1Reg].first;

    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
      Op2Reg = RegRewriteTable[Op2Reg].second;

    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(TrueMBB);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  return MIB;
}

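// Worked example of the rewrite table above (register names invented for
// illustration): when lowering
//
//   %t2 = CMOV %t1, %f1, cc
//   %t3 = CMOV %t2, %f2, cc
//
// the first PHI records RegRewriteTable[%t2] = (%t1, %f1). The second PHI
// then rewrites its %t2 input to %t1 on that edge, because %t2 (a PHI
// result in the same block) is not available as an incoming value.
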
// Lower cascaded selects in the form of
// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
MachineBasicBlock *
X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
                                             MachineInstr &SecondCascadedCMOV,
                                             MachineBasicBlock *ThisMBB) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = FirstCMOV.getDebugLoc();

  // We lower cascaded CMOVs such as
  //
  //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
  //
  // to two successive branches.
  //
  // Without this, we would add a PHI between the two jumps, which ends up
  // creating a few copies all around. For instance, for
  //
  //   (sitofp (zext (fcmp une)))
  //
  // we would generate:
  //
  //   ucomiss %xmm1, %xmm0
  //   movss   <1.0f>, %xmm0
  //   movaps  %xmm0, %xmm1
  //   jne     .LBB5_2
  //   xorps   %xmm1, %xmm1
  // .LBB5_2:
  //   jp      .LBB5_4
  //   movaps  %xmm1, %xmm0
  // .LBB5_4:
  //   retq
  //
  // because this custom-inserter would have generated:
  //
  //   A
  //   | \
  //   |  B
  //   | /
  //   C
  //   | \
  //   |  D
  //   | /
  //   E
  //
  //   A: X = ...; Y = ...
  //   B: empty
  //   C: Z = PHI [X, A], [Y, B]
  //   D: empty
  //   E: PHI [X, C], [Z, D]
  //
  // If we lower both CMOVs in a single step, we can instead generate:
  //
  //   A
  //   | \
  //   |  C
  //   | /|
  //   |/ |
  //   |  |
  //   |  D
  //   | /
  //   E
  //
  //   A: X = ...; Y = ...
  //   D: empty
  //   E: PHI [X, A], [X, C], [Y, D]
  //
  // Which, in our sitofp/fcmp example, gives us something like:
  //
  //   ucomiss %xmm1, %xmm0
  //   movss   <1.0f>, %xmm0
  //   jne     .LBB5_4
  //   xorps   %xmm0, %xmm0
  // .LBB5_4:
  //   retq
  //

  // We lower cascaded CMOV into two successive branches to the same block.
  // EFLAGS is used by both, so mark it as live in the second.
  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
  MachineFunction *F = ThisMBB->getParent();
  MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

  MachineFunction::iterator It = ++ThisMBB->getIterator();
  F->insert(It, FirstInsertedMBB);
  F->insert(It, SecondInsertedMBB);
  F->insert(It, SinkMBB);

  // For a cascaded CMOV, we lower it to two successive branches to
  // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
  // the FirstInsertedMBB.
  FirstInsertedMBB->addLiveIn(X86::EFLAGS);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
    SecondInsertedMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
                  std::next(MachineBasicBlock::iterator(FirstCMOV)),
                  ThisMBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

  // Fallthrough block for ThisMBB.
  ThisMBB->addSuccessor(FirstInsertedMBB);
  // The true block target of the first branch is always SinkMBB.
  ThisMBB->addSuccessor(SinkMBB);
  // Fallthrough block for FirstInsertedMBB.
  FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
  // The true block for the branch of FirstInsertedMBB.
  FirstInsertedMBB->addSuccessor(SinkMBB);
  // This is fallthrough.
  SecondInsertedMBB->addSuccessor(SinkMBB);

  // Create the conditional branch instructions.
  X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
  unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
  BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

  X86::CondCode SecondCC =
      X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
  unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
  BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);

  //  SinkMBB:
  //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
  unsigned DestReg = FirstCMOV.getOperand(0).getReg();
  unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
  unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
  MachineInstrBuilder MIB =
      BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
          .addReg(Op1Reg)
          .addMBB(SecondInsertedMBB)
          .addReg(Op2Reg)
          .addMBB(ThisMBB);

  // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
  // (the True operand of the SELECT_CC/CMOV nodes).
  MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
  // Copy the PHI result to the register defined by the second CMOV.
  BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
          TII->get(TargetOpcode::COPY),
          SecondCascadedCMOV.getOperand(0).getReg())
      .addReg(FirstCMOV.getOperand(0).getReg());

  // Now remove the CMOVs.
  FirstCMOV.eraseFromParent();
  SecondCascadedCMOV.eraseFromParent();

  return SinkMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
                                     MachineBasicBlock *ThisMBB) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern. The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between and a branch opcode to use.

  //  ThisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC SinkMBB
  //   fallthrough --> FalseMBB

  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
  // as described above, by inserting a BB, and then making a PHI at the join
  // point to select the true and false operands of the CMOV in the PHI.
  //
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
  // In this case, there are multiple CMOVs in a row, all of which are based
  // on the same condition setting (or the exact opposite condition setting).
  // In this case we can lower all the CMOVs using a single inserted BB, and
  // then make a number of PHIs at the join point to model the CMOVs. The only
  // trickiness here is that, in a case like:
  //
  // t2 = CMOV cond1 t1, f1
  // t3 = CMOV cond1 t2, f2
  //
  // when rewriting this into PHIs, we have to perform some renaming on the
  // temps since you cannot have a PHI operand refer to a PHI result earlier
  // in the same block. The "simple" but wrong lowering would be:
  //
  // t2 = PHI t1(BB1), f1(BB2)
  // t3 = PHI t2(BB1), f2(BB2)
  //
  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
  // renaming is to note that on the path through BB1, t2 is really just a
  // copy of t1, and do that renaming, properly generating:
  //
  // t2 = PHI t1(BB1), f1(BB2)
  // t3 = PHI t1(BB1), f2(BB2)
  //
  // Case 2:
  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
  // function - EmitLoweredCascadedSelect.

  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineInstr *LastCMOV = &MI;
  MachineBasicBlock::iterator NextMIIt =
      std::next(MachineBasicBlock::iterator(MI));

  // Check for case 1, where there are multiple CMOVs with the same condition
  // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
  // number of jumps the most.

  if (isCMOVPseudo(MI)) {
    // See if we have a string of CMOVS with the same condition.
    while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
           (NextMIIt->getOperand(3).getImm() == CC ||
            NextMIIt->getOperand(3).getImm() == OppCC)) {
      LastCMOV = &*NextMIIt;
      ++NextMIIt;
    }
  }

  // This checks for case 2, but only do this if we didn't already find
  // case 1, as indicated by LastCMOV == MI.
  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
      NextMIIt->getOpcode() == MI.getOpcode() &&
      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
      NextMIIt->getOperand(1).isKill()) {
    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
  }

  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
  MachineFunction *F = ThisMBB->getParent();
  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

  MachineFunction::iterator It = ++ThisMBB->getIterator();
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)),
                  ThisMBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

  // Fallthrough block for ThisMBB.
  ThisMBB->addSuccessor(FalseMBB);
  // The true block target of the first (or only) branch is always SinkMBB.
  ThisMBB->addSuccessor(SinkMBB);
  // Fallthrough block for FalseMBB.
  FalseMBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  unsigned Opc = X86::GetCondBranchFromCond(CC);
  BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

  //  SinkMBB:
  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
  //  ...
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);

  // Now remove the CMOV(s).
  ThisMBB->erase(MIItBegin, MIItEnd);

  return SinkMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  // Combine the following atomic floating-point modification pattern:
  //   a.store(reg OP a.load(acquire), release)
  // Transform it into:
  //   OPss (%gpr), %xmm
  //   movss %xmm, (%gpr)
  // Or the sd equivalent for 64-bit operations.
  unsigned MOp, FOp;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
  case X86::RELEASE_FADD32mr:
    FOp = X86::ADDSSrm;
    MOp = X86::MOVSSmr;
    break;
  case X86::RELEASE_FADD64mr:
    FOp = X86::ADDSDrm;
    MOp = X86::MOVSDmr;
    break;
  }
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  unsigned ValOpIdx = X86::AddrNumOperands;
  unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
  MachineInstrBuilder MIB =
      BuildMI(*BB, MI, DL, TII->get(FOp),
              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
          .addReg(VSrc);
  for (int i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand &Operand = MI.getOperand(i);
    // Clear any kill flags on register operands as we'll create a second
    // instruction using the same address operands.
    if (Operand.isReg())
      Operand.setIsKill(false);
    MIB.add(Operand);
  }
  MachineInstr *FOpMI = MIB;
  MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));
  MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

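// Note on the lowering above: on x86, ordinary loads already have acquire
// semantics and ordinary stores already have release semantics (TSO), so a
// plain ADDSS/ADDSD from memory followed by a plain MOVSS/MOVSD store is
// sufficient for this acquire-load / release-store pattern; no fence or
// LOCK prefix is needed (the pattern is not a single atomic RMW).
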
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
                                        MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();

  assert(MF->shouldSplitStack());

  const bool Is64Bit = Subtarget.is64Bit();
  const bool IsLP64 = Subtarget.isTarget64BitLP64();

  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

  // BB:
  //  ... [Till the alloca]
  // If the stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
  //

  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *AddrRegClass =
      getRegClassFor(getPointerTy(MF->getDataLayout()));

  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
           sizeVReg = MI.getOperand(1).getReg(),
           physSPReg =
               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;

  MachineFunction::iterator MBBIter = ++BB->getIterator();

  MF->insert(MBBIter, bumpMBB);
  MF->insert(MBBIter, mallocMBB);
  MF->insert(MBBIter, continueMBB);

  continueMBB->splice(continueMBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add code to the main basic block to check if the stack limit has been hit,
  // and if so, jump to mallocMBB otherwise to bumpMBB.
  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
    .addReg(tmpSPVReg).addReg(sizeVReg);
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
    .addReg(SPLimitVReg);
  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

  // bumpMBB simply decreases the stack pointer, since we know the current
  // stacklet has enough space.
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
    .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
    .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // Calls into a routine in libgcc to allocate more space from the heap.
  const uint32_t *RegMask =
      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
  if (IsLP64) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::RDI, RegState::Implicit)
      .addReg(X86::RAX, RegState::ImplicitDefine);
  } else if (Is64Bit) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::EDI, RegState::Implicit)
      .addReg(X86::EAX, RegState::ImplicitDefine);
  } else {
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
      .addImm(12);
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::EAX, RegState::ImplicitDefine);
  }

  if (!Is64Bit)
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
      .addImm(16);

  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
    .addReg(IsLP64 ? X86::RAX : X86::EAX);
  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // Set up the CFG correctly.
  BB->addSuccessor(bumpMBB);
  BB->addSuccessor(mallocMBB);
  mallocMBB->addSuccessor(continueMBB);
  bumpMBB->addSuccessor(continueMBB);

  // Take care of the PHI nodes.
  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
          MI.getOperand(0).getReg())
      .addReg(mallocPtrVReg)
      .addMBB(mallocMBB)
      .addReg(bumpSPPtrVReg)
      .addMBB(bumpMBB);

  // Delete the original pseudo instruction.
  MI.eraseFromParent();

  // And we're done.
  return continueMBB;
}

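// Note on the TlsOffset values used above: they address the per-thread
// stack-limit slot that the split-stack runtime (libgcc's __morestack
// machinery) keeps in the thread control block, reached via %fs on 64-bit
// and %gs on 32-bit; the 0x70/0x40/0x30 constants mirror that runtime's
// layout, assuming a split-stack-aware runtime is linked in.
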
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
  DebugLoc DL = MI.getDebugLoc();

  assert(!isAsynchronousEHPersonality(
             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
         "SEH does not use catchret!");

  // Only 32-bit EH needs to worry about manually restoring stack pointers.
  if (!Subtarget.is32Bit())
    return BB;

  // C++ EH creates a new target block to hold the restore code, and wires up
  // the new block to the return destination with a normal JMP_4.
  MachineBasicBlock *RestoreMBB =
      MF->CreateMachineBasicBlock(BB->getBasicBlock());
  assert(BB->succ_size() == 1);
  MF->insert(std::next(BB->getIterator()), RestoreMBB);
  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(RestoreMBB);
  MI.getOperand(0).setMBB(RestoreMBB);

  auto RestoreMBBI = RestoreMBB->begin();
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const Constant *PerFn = MF->getFunction().getPersonalityFn();
  bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
  // Only 32-bit SEH requires special handling for catchpad.
  if (IsSEH && Subtarget.is32Bit()) {
    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
  }
  MI.eraseFromParent();
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  // So, here we replace TLSADDR with the sequence:
  // adjust_stackdown -> TLSADDR -> adjust_stackup.
  // We need this because TLSADDR is lowered into calls
  // inside MC; therefore, without the two markers, shrink-wrapping
  // may push the prologue/epilogue past them.
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction &MF = *BB->getParent();

  // Emit CALLSEQ_START right before the instruction.
  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
  MachineInstrBuilder CallseqStart =
      BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
  BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);

  // Emit CALLSEQ_END right after the instruction.
  // We don't call erase from parent because we want to keep the
  // original instruction around.
  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
  MachineInstrBuilder CallseqEnd =
      BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);

  return BB;
}

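// Illustrative result of the rewrite above, in pseudo-MI form (the immediate
// counts match the call-frame setup/destroy pseudos used as the markers):
//
//   ADJCALLSTACKDOWN 0, 0, 0
//   TLS_addr...               ; expanded to a real call inside MC
//   ADJCALLSTACKUP 0, 0
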
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  // This is pretty easy. We're taking the value that we received from
  // our load from the relocation, sticking it in either RDI (x86-64)
  // or EAX, and doing an indirect call. The return value will then
  // be in the normal return register.
  MachineFunction *F = BB->getParent();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI.getOperand(3).isGlobal() && "This should be a global");

  // Get a register mask for the lowered call.
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask =
      Subtarget.is64Bit() ?
      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
  if (Subtarget.is64Bit()) {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
            .addReg(X86::RIP)
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else if (!isPositionIndependent()) {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(0)
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(TII->getGlobalBaseReg(F))
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

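// On Darwin, the 64-bit sequence emitted above corresponds roughly to the
// usual thread-local variable access pattern (a sketch, not literal output):
//
//   movq  _var@TLVP(%rip), %rdi   ; load pointer to the TLV descriptor
//   callq *(%rdi)                 ; call its getter; result in %rax
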
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  unsigned DstReg;
  unsigned MemOpndSlot = 0;

  unsigned CurOp = 0;

  DstReg = MI.getOperand(CurOp++).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
  (void)TRI;
  unsigned mainDstReg = MRI.createVirtualRegister(RC);
  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

  MemOpndSlot = CurOp;

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  if the base pointer is being used, load it from the frame
  //  v_restore = 1

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);
  MF->push_back(restoreMBB);
  restoreMBB->setHasAddressTaken();

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  unsigned PtrStoreOpc = 0;
  unsigned LabelReg = 0;
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  // Prepare IP either in reg or imm.
  if (!UseImmLabel) {
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
    LabelReg = MRI.createVirtualRegister(PtrRC);
    if (Subtarget.is64Bit()) {
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
              .addReg(X86::RIP)
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB)
              .addReg(0);
    } else {
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
              .addReg(XII->getGlobalBaseReg(MF))
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
              .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  // Store IP
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.add(MI.getOperand(MemOpndSlot + i));
  }
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
          .addMBB(restoreMBB);

  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  MIB.addRegMask(RegInfo->getNoPreservedMask());
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(restoreMBB);

  // mainMBB:
  //  v_main = 0
  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), DstReg)
      .addReg(mainDstReg).addMBB(mainMBB)
      .addReg(restoreDstReg).addMBB(restoreMBB);

  // restoreMBB:
  if (RegInfo->hasBasePointer(*MF)) {
    const bool Uses64BitFramePtr =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
    X86FI->setRestoreBasePointer(MF);
    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
    unsigned BasePtr = RegInfo->getBaseRegister();
    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
  restoreMBB->addSuccessor(sinkMBB);

  MI.eraseFromParent();
  return sinkMBB;
}

MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
  unsigned Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
  unsigned SP = RegInfo->getStackRegister();

  MachineInstrBuilder MIB;

  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();

  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

  // Reload FP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload IP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), LabelOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload SP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), SPOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Jump
  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

  MI.eraseFromParent();
  return MBB;
}

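// Layout of the setjmp buffer assumed by the code above, in pointer-sized
// slots: slot 0 holds the frame pointer, slot 1 (LabelOffset) the resume
// address stored by emitEHSjLjSetJmp, and slot 2 (SPOffset) the stack
// pointer; all three are reloaded before the indirect jump.
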
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

  unsigned Op = 0;
  unsigned VR = 0;

  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  if (UseImmLabel) {
    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  } else {
    const TargetRegisterClass *TRC =
        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
    VR = MRI->createVirtualRegister(TRC);
    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

    if (Subtarget.is64Bit())
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
          .addReg(X86::RIP)
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB)
          .addReg(0);
    else
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
          .addReg(0) /* TII->getGlobalBaseReg(MF) */
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
          .addReg(0);
  }

  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
  if (UseImmLabel)
    MIB.addMBB(DispatchBB);
  else
    MIB.addReg(VR);
}

MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = BB->getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  int FI = MFI.getFunctionContextIndex();

  // Get a mapping of the call site numbers to all of the landing pads they're
  // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  for (auto &MBB : *MF) {
    if (!MBB.isEHPad())
      continue;

    MCSymbol *Sym = nullptr;
    for (const auto &MI : MBB) {
      if (MI.isDebugValue())
        continue;

      assert(MI.isEHLabel() && "expected EH_LABEL");
      Sym = MI.getOperand(0).getMCSymbol();
      break;
    }

    if (!MF->hasCallSiteLandingPad(Sym))
      continue;

    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
      CallSiteNumToLPad[CSI].push_back(&MBB);
      MaxCSNum = std::max(MaxCSNum, CSI);
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  std::vector<MachineBasicBlock *> LPadList;
  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());

  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
    for (auto &LP : CallSiteNumToLPad[CSI]) {
      LPadList.push_back(LP);
      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad(true);

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert MBBs.
  MF->push_back(DispatchBB);
  MF->push_back(DispContBB);
  MF->push_back(TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

  // Create the jump table and associated information
  unsigned JTE = getJumpTableEncoding();
  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  const X86RegisterInfo &RI = TII->getRegisterInfo();
  // Add a register mask with no preserved registers. This results in all
  // registers being marked as clobbered.
  if (RI.hasBasePointer(*MF)) {
    const bool FPIs64Bit =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
    MFI->setRestoreBasePointer(MF);

    unsigned FP = RI.getFrameRegister(*MF);
    unsigned BP = RI.getBaseRegister();
    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
                 MFI->getRestoreBasePointerOffset())
        .addRegMask(RI.getNoPreservedMask());
  } else {
    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
        .addRegMask(RI.getNoPreservedMask());
  }

  // IReg is used as an index in a memory operand and therefore can't be SP
  unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
                    Subtarget.is64Bit() ? 8 : 4);
  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
      .addReg(IReg)
      .addImm(LPadList.size());
  BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);

  if (Subtarget.is64Bit()) {
    unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
    unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);

    // leaq .LJTI0_0(%rip), BReg
    BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
        .addReg(X86::RIP)
        .addImm(1)
        .addReg(0)
        .addJumpTableIndex(MJTI)
        .addReg(0);
    // movzx IReg64, IReg
    BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
        .addImm(0)
        .addReg(IReg)
        .addImm(X86::sub_32bit);

    switch (JTE) {
    case MachineJumpTableInfo::EK_BlockAddress:
      // jmpq *(BReg,IReg64,8)
      BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
          .addReg(BReg)
          .addImm(8)
          .addReg(IReg64)
          .addImm(0)
          .addReg(0);
      break;
    case MachineJumpTableInfo::EK_LabelDifference32: {
      unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
      unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
      unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);

      // movl (BReg,IReg64,4), OReg
      BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
          .addReg(BReg)
          .addImm(4)
          .addReg(IReg64)
          .addImm(0)
          .addReg(0);
      // movsx OReg64, OReg
      BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
      // addq BReg, OReg64, TReg
      BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
          .addReg(OReg64)
          .addReg(BReg);
      // jmpq *TReg
      BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
      break;
    }
    default:
      llvm_unreachable("Unexpected jump table encoding");
    }
  } else {
    // jmpl *.LJTI0_0(,IReg,4)
    BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
        .addReg(0)
        .addImm(4)
        .addReg(IReg)
        .addJumpTableIndex(MJTI)
        .addReg(0);
  }

  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
  for (auto &LP : LPadList)
    if (SeenMBBs.insert(LP).second)
      DispContBB->addSuccessor(LP);

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  SmallVector<MachineBasicBlock *, 64> MBBLPads;
  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
  for (MachineBasicBlock *MBB : InvokeBBs) {
    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    // Keep a copy of Successors since it's modified inside the loop.
    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
                                                   MBB->succ_rend());
    // FIXME: Avoid quadratic complexity.
    for (auto MBBS : Successors) {
      if (MBBS->isEHPad()) {
        MBB->removeSuccessor(MBBS);
        MBBLPads.push_back(MBBS);
      }
    }

    MBB->addSuccessor(DispatchBB);

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (auto &II : reverse(*MBB)) {
      if (!II.isCall())
        continue;

      DenseMap<unsigned, bool> DefRegs;
      for (auto &MOp : II.operands())
        if (MOp.isReg())
          DefRegs[MOp.getReg()] = true;

      MachineInstrBuilder MIB(*MF, &II);
      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
        unsigned Reg = SavedRegs[RI];
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the
  // only landing pad now.
  for (auto &LP : MBBLPads)
    LP->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();

  return BB;
}

27380 MachineBasicBlock *
27381 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
27382 MachineBasicBlock *BB) const {
27383 MachineFunction *MF = BB->getParent();
27384 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27385 DebugLoc DL = MI.getDebugLoc();
27387 switch (MI.getOpcode()) {
27388 default: llvm_unreachable("Unexpected instr type to insert");
27389 case X86::TAILJMPd64:
27390 case X86::TAILJMPr64:
27391 case X86::TAILJMPm64:
27392 case X86::TAILJMPr64_REX:
27393 case X86::TAILJMPm64_REX:
27394 llvm_unreachable("TAILJMP64 would not be touched here.");
27395 case X86::TCRETURNdi64:
27396 case X86::TCRETURNri64:
27397 case X86::TCRETURNmi64:
27399 case X86::TLS_addr32:
27400 case X86::TLS_addr64:
27401 case X86::TLS_base_addr32:
27402 case X86::TLS_base_addr64:
27403 return EmitLoweredTLSAddr(MI, BB);
27404 case X86::CATCHRET:
27405 return EmitLoweredCatchRet(MI, BB);
27406 case X86::CATCHPAD:
27407 return EmitLoweredCatchPad(MI, BB);
27408 case X86::SEG_ALLOCA_32:
27409 case X86::SEG_ALLOCA_64:
27410 return EmitLoweredSegAlloca(MI, BB);
27411 case X86::TLSCall_32:
27412 case X86::TLSCall_64:
27413 return EmitLoweredTLSCall(MI, BB);
27414 case X86::CMOV_FR32:
27415 case X86::CMOV_FR64:
27416 case X86::CMOV_FR128:
27417 case X86::CMOV_GR8:
27418 case X86::CMOV_GR16:
27419 case X86::CMOV_GR32:
27420 case X86::CMOV_RFP32:
27421 case X86::CMOV_RFP64:
27422 case X86::CMOV_RFP80:
27423 case X86::CMOV_V2F64:
27424 case X86::CMOV_V2I64:
27425 case X86::CMOV_V4F32:
27426 case X86::CMOV_V4F64:
27427 case X86::CMOV_V4I64:
27428 case X86::CMOV_V16F32:
27429 case X86::CMOV_V8F32:
27430 case X86::CMOV_V8F64:
27431 case X86::CMOV_V8I64:
27432 case X86::CMOV_V8I1:
27433 case X86::CMOV_V16I1:
27434 case X86::CMOV_V32I1:
27435 case X86::CMOV_V64I1:
27436 return EmitLoweredSelect(MI, BB);
27438 case X86::RDFLAGS32:
27439 case X86::RDFLAGS64: {
    unsigned PushF =
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
27442 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
27443 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
27444 // Permit reads of the FLAGS register without it being defined.
27445 // This intrinsic exists to read external processor state in flags, such as
27446 // the trap flag, interrupt flag, and direction flag, none of which are
27447 // modeled by the backend.
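    // For illustration (not literal emitted text), the 64-bit variant simply
    // expands to:
    //   pushfq
    //   popq  %dst
    // with the PUSHF's implicit FLAGS operand marked undef below.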
27448 Push->getOperand(2).setIsUndef();
27449 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
27455 case X86::WRFLAGS32:
27456 case X86::WRFLAGS64: {
    unsigned Push =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
    unsigned PopF =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
27461 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
27462 BuildMI(*BB, MI, DL, TII->get(PopF));
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
27468 case X86::RELEASE_FADD32mr:
27469 case X86::RELEASE_FADD64mr:
27470 return EmitLoweredAtomicFP(MI, BB);
27472 case X86::FP32_TO_INT16_IN_MEM:
27473 case X86::FP32_TO_INT32_IN_MEM:
27474 case X86::FP32_TO_INT64_IN_MEM:
27475 case X86::FP64_TO_INT16_IN_MEM:
27476 case X86::FP64_TO_INT32_IN_MEM:
27477 case X86::FP64_TO_INT64_IN_MEM:
27478 case X86::FP80_TO_INT16_IN_MEM:
27479 case X86::FP80_TO_INT32_IN_MEM:
27480 case X86::FP80_TO_INT64_IN_MEM: {
27481 // Change the floating point control register to use "round towards zero"
27482 // mode when truncating to an integer value.
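    // For illustration, the emitted sequence is roughly (slot names are
    // placeholders, not actual assembly that is printed):
    //   fnstcw <cw>          ; save the current FP control word
    //   movw   $0xC7F, <cw>  ; select round-towards-zero
    //   fldcw  <cw>          ; make it active
    //   movw   %oldcw, <cw>  ; restore the memory image for the final reload
    //   fistp  <dst>         ; truncating integer store
    //   fldcw  <cw>          ; reload the original control word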
27483 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
27484 addFrameReference(BuildMI(*BB, MI, DL,
27485 TII->get(X86::FNSTCW16m)), CWFrameIdx);
27487 // Load the old value of the high byte of the control word...
    unsigned OldCW =
        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);
27493 // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
        .addImm(0xC7F);
27497 // Reload the modified control word now...
27498 addFrameReference(BuildMI(*BB, MI, DL,
27499 TII->get(X86::FLDCW16m)), CWFrameIdx);
27501 // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
        .addReg(OldCW);
    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI.getOpcode()) {
27508 default: llvm_unreachable("illegal opcode!");
27509 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
27510 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
27511 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
27512 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
27513 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
27514 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
27515 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
27516 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }
27520 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27521 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
27522 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
27524 // Reload the original control word now.
27525 addFrameReference(BuildMI(*BB, MI, DL,
27526 TII->get(X86::FLDCW16m)), CWFrameIdx);
    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }
27531 // String/text processing lowering.
27532 case X86::PCMPISTRM128REG:
27533 case X86::VPCMPISTRM128REG:
27534 case X86::PCMPISTRM128MEM:
27535 case X86::VPCMPISTRM128MEM:
27536 case X86::PCMPESTRM128REG:
27537 case X86::VPCMPESTRM128REG:
27538 case X86::PCMPESTRM128MEM:
27539 case X86::VPCMPESTRM128MEM:
27540 assert(Subtarget.hasSSE42() &&
27541 "Target must have SSE4.2 or AVX features enabled");
27542 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
27544 // String/text processing lowering.
27545 case X86::PCMPISTRIREG:
27546 case X86::VPCMPISTRIREG:
27547 case X86::PCMPISTRIMEM:
27548 case X86::VPCMPISTRIMEM:
27549 case X86::PCMPESTRIREG:
27550 case X86::VPCMPESTRIREG:
27551 case X86::PCMPESTRIMEM:
27552 case X86::VPCMPESTRIMEM:
27553 assert(Subtarget.hasSSE42() &&
27554 "Target must have SSE4.2 or AVX features enabled");
27555 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
  // Thread synchronization.
  case X86::MONITOR:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
  case X86::MONITORX:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);

  // Cache line zero
  case X86::CLZERO:
    return emitClzero(&MI, BB, Subtarget);

  // PKU feature
  case X86::WRPKRU:
    return emitWRPKRU(MI, BB, Subtarget);
  case X86::RDPKRU:
    return emitRDPKRU(MI, BB, Subtarget);
  // xbegin
  case X86::XBEGIN:
    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
27576 case X86::VASTART_SAVE_XMM_REGS:
27577 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
27579 case X86::VAARG_64:
27580 return EmitVAARG64WithCustomInserter(MI, BB);
27582 case X86::EH_SjLj_SetJmp32:
27583 case X86::EH_SjLj_SetJmp64:
27584 return emitEHSjLjSetJmp(MI, BB);
27586 case X86::EH_SjLj_LongJmp32:
27587 case X86::EH_SjLj_LongJmp64:
27588 return emitEHSjLjLongJmp(MI, BB);
27590 case X86::Int_eh_sjlj_setup_dispatch:
27591 return EmitSjLjDispatchBlock(MI, BB);
27593 case TargetOpcode::STATEPOINT:
27594 // As an implementation detail, STATEPOINT shares the STACKMAP format at
27595 // this point in the process. We diverge later.
27596 return emitPatchPoint(MI, BB);
27598 case TargetOpcode::STACKMAP:
27599 case TargetOpcode::PATCHPOINT:
27600 return emitPatchPoint(MI, BB);
27602 case TargetOpcode::PATCHABLE_EVENT_CALL:
    // Do nothing here, handle in xray instrumentation pass.
    return BB;
27606 case X86::LCMPXCHG8B: {
27607 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    // In addition to the four E[ABCD] registers implied by its encoding,
    // CMPXCHG8B requires a memory operand. If the current architecture is
    // i686 and the current function needs a base pointer - which is ESI on
    // i686 - the register allocator would not be able to allocate registers
    // for an address of the form X(%reg, %reg, Y): there would never be
    // enough unreserved registers during regalloc (without the base pointer
    // the only option would be X(%edi, %esi, Y)). We give the register
    // allocator a hand by precomputing the address in a new vreg using LEA.
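    // Illustrative example of the rewrite:
    //   cmpxchg8b 16(%esi,%ecx,4)
    // becomes
    //   leal      16(%esi,%ecx,4), %vreg
    //   cmpxchg8b (%vreg)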
27618 // If it is not i686 or there is no base pointer - nothing to do here.
    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
      return BB;
    // Even though this code does not necessarily need the base pointer to
    // be ESI, we check for that. The reason: if this assert fails, then
    // something has changed in the compiler's base pointer handling, and
    // that change most probably has to be addressed here as well.
27626 assert(TRI->getBaseRegister() == X86::ESI &&
27627 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
27628 "base pointer in mind");
27630 MachineRegisterInfo &MRI = MF->getRegInfo();
27631 MVT SPTy = getPointerTy(MF->getDataLayout());
27632 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27633 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
27635 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27636 // Regalloc does not need any help when the memory operand of CMPXCHG8B
27637 // does not use index register.
    if (AM.IndexReg == X86::NoRegister)
      return BB;
27641 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
27642 // four operand definitions that are E[ABCD] registers. We skip them and
27643 // then insert the LEA.
27644 MachineBasicBlock::iterator MBBI(MI);
27645 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
27646 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
      --MBBI;
    addFullAddress(
        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

    return BB;
  }
  case X86::LCMPXCHG16B:
    return BB;
  case X86::LCMPXCHG8B_SAVE_EBX:
  case X86::LCMPXCHG16B_SAVE_RBX: {
    unsigned BasePtr =
        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
    if (!BB->isLiveIn(BasePtr))
      BB->addLiveIn(BasePtr);
    return BB;
  }
  }
}
27668 //===----------------------------------------------------------------------===//
27669 // X86 Optimization Hooks
27670 //===----------------------------------------------------------------------===//
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
27677 unsigned BitWidth = Known.getBitWidth();
27678 unsigned Opc = Op.getOpcode();
27679 EVT VT = Op.getValueType();
27680 assert((Opc >= ISD::BUILTIN_OP_END ||
27681 Opc == ISD::INTRINSIC_WO_CHAIN ||
27682 Opc == ISD::INTRINSIC_W_CHAIN ||
27683 Opc == ISD::INTRINSIC_VOID) &&
27684 "Should use MaskedValueIsZero if you don't know whether Op"
27685 " is a target node!");
27690 case X86ISD::SETCC:
    Known.Zero.setBitsFrom(1);
    break;
27693 case X86ISD::MOVMSK: {
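    // MOVMSK produces one result bit per vector element; e.g. (illustrative)
    // a v4f32 source sets only bits [3:0] of the scalar result, so every bit
    // above the element count is known zero.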
27694 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
    Known.Zero.setBitsFrom(NumLoBits);
    break;
  }
27698 case X86ISD::PEXTRB:
27699 case X86ISD::PEXTRW: {
27700 SDValue Src = Op.getOperand(0);
27701 EVT SrcVT = Src.getValueType();
27702 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
27703 Op.getConstantOperandVal(1));
27704 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
27705 Known = Known.zextOrTrunc(BitWidth);
    Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
    break;
  }
27709 case X86ISD::VSHLI:
27710 case X86ISD::VSRLI: {
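    // Immediate vector shifts shift in zeros; e.g. (illustrative) VSRLI by 4
    // on i16 elements makes the top 4 bits of each element known zero, which
    // is what the setLowBits/setHighBits calls below record.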
27711 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
27712 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
        Known.setAllZero();
        break;
      }
27717 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
27718 unsigned ShAmt = ShiftImm->getZExtValue();
27719 if (Opc == X86ISD::VSHLI) {
27720 Known.Zero <<= ShAmt;
27721 Known.One <<= ShAmt;
27722 // Low bits are known zero.
        Known.Zero.setLowBits(ShAmt);
      } else {
        Known.Zero.lshrInPlace(ShAmt);
27726 Known.One.lshrInPlace(ShAmt);
27727 // High bits are known zero.
        Known.Zero.setHighBits(ShAmt);
      }
    }
    break;
  }
27733 case X86ISD::VZEXT: {
27734 // TODO: Add DemandedElts support.
27735 SDValue N0 = Op.getOperand(0);
27736 unsigned NumElts = VT.getVectorNumElements();
27738 EVT SrcVT = N0.getValueType();
27739 unsigned InNumElts = SrcVT.getVectorNumElements();
27740 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27741 assert(InNumElts >= NumElts && "Illegal VZEXT input");
27743 Known = KnownBits(InBitWidth);
27744 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27745 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27746 Known = Known.zext(BitWidth);
    Known.Zero.setBitsFrom(InBitWidth);
    break;
  }
27750 case X86ISD::CMOV: {
27751 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
27752 // If we don't know any bits, early out.
    if (Known.isUnknown())
      break;

    KnownBits Known2;
    DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
27758 // Only known if known in both the LHS and RHS.
27759 Known.One &= Known2.One;
    Known.Zero &= Known2.Zero;
    break;
  }
27763 case X86ISD::UDIVREM8_ZEXT_HREG:
27764 // TODO: Support more than just the zero extended bits?
    if (Op.getResNo() != 1)
      break;
27767 // The remainder is zero extended.
    Known.Zero.setBitsFrom(8);
    break;
  }
}
27773 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27774 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27775 unsigned Depth) const {
27776 unsigned VTBits = Op.getScalarValueSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
27779 case X86ISD::SETCC_CARRY:
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
    return VTBits;
27783 case X86ISD::VSEXT: {
27784 // TODO: Add DemandedElts support.
27785 SDValue Src = Op.getOperand(0);
27786 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    Tmp += VTBits - Src.getScalarValueSizeInBits();
    return Tmp;
  }
27791 case X86ISD::VTRUNC: {
27792 // TODO: Add DemandedElts support.
27793 SDValue Src = Op.getOperand(0);
27794 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
27795 assert(VTBits < NumSrcBits && "Illegal truncation input type");
27796 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27797 if (Tmp > (NumSrcBits - VTBits))
      return Tmp - (NumSrcBits - VTBits);
    return 1;
  }
27802 case X86ISD::PACKSS: {
27803 // PACKSS is just a truncation if the sign bits extend to the packed size.
27804 // TODO: Add DemandedElts support.
27805 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
27806 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
27807 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
27808 unsigned Tmp = std::min(Tmp0, Tmp1);
27809 if (Tmp > (SrcBits - VTBits))
      return Tmp - (SrcBits - VTBits);
    return 1;
  }
27814 case X86ISD::VSHLI: {
27815 SDValue Src = Op.getOperand(0);
27816 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27817 if (ShiftVal.uge(VTBits))
27818 return VTBits; // Shifted all bits out --> zero.
27819 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
27820 if (ShiftVal.uge(Tmp))
27821 return 1; // Shifted all sign bits out --> unknown.
    return Tmp - ShiftVal.getZExtValue();
  }
27825 case X86ISD::VSRAI: {
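    // An immediate arithmetic shift right replicates the sign bit; e.g.
    // (illustrative) starting from 1 known sign bit, VSRAI by 3 on v8i16
    // yields 4 known sign bits, clamped to the 16-bit element width below.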
27826 SDValue Src = Op.getOperand(0);
27827 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27828 if (ShiftVal.uge(VTBits - 1))
27829 return VTBits; // Sign splat.
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    ShiftVal += Tmp;
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
  }
27835 case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    // Vector compares return zero/all-bits result values, so every bit of
    // each element is a copy of the sign bit.
    return VTBits;
27843 case X86ISD::CMOV: {
27844 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
27845 if (Tmp0 == 1) return 1; // Early out.
27846 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
    return std::min(Tmp0, Tmp1);
  }
  case X86ISD::SDIVREM8_SEXT_HREG:
    // TODO: Support more than just the sign extended bits?
    if (Op.getResNo() != 1)
      break;
    // The remainder is sign extended.
    return VTBits - 7;
  }

  // Fallback case.
  return 1;
}
27861 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
27862 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
    return N->getOperand(0);
  return N;
}
27867 /// Returns true (and the GlobalValue and the offset) if the node is a
27868 /// GlobalAddress + offset.
27869 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27870 const GlobalValue* &GA,
27871 int64_t &Offset) const {
27872 if (N->getOpcode() == X86ISD::Wrapper) {
27873 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27874 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
27884 // TODO: Investigate sharing more of this with shuffle lowering.
27885 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27886 bool AllowFloatDomain, bool AllowIntDomain,
27887 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27888 const X86Subtarget &Subtarget,
27889 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27890 unsigned NumMaskElts = Mask.size();
27891 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27893 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27894 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
27895 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27896 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27897 unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
      bool Match = true;
      unsigned NumDstElts = NumMaskElts / Scale;
      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27902 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
      if (Match) {
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27907 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
27908 MVT::getIntegerVT(MaskEltSize);
27909 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
27911 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
27912 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
          Shuffle = unsigned(X86ISD::VZEXT);
        } else
          Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
27917 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
        return true;
      }
    }
  }
27924 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
27925 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27926 isUndefOrEqual(Mask[0], 0) &&
27927 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27928 Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }
  // Check if we have SSE3 which will let us use MOVDDUP etc. These
  // instructions are no slower than UNPCKLPD but have the option to
  // fold the input operand into even an unaligned memory load.
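  // Mask idioms used below (illustrative): MOVDDUP duplicates the low f64
  // ({0,0}), MOVSLDUP the even f32 elements ({0,0,2,2}) and MOVSHDUP the odd
  // ones ({1,1,3,3}), repeated per 128-bit lane for the wider types.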
27936 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27937 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
27938 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
27942 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27943 Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
27947 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27948 Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }
27954 if (MaskVT.is256BitVector() && AllowFloatDomain) {
27955 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27956 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27957 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v4f64;
      return true;
    }
27961 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27962 Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
27966 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27967 Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
  }
27973 if (MaskVT.is512BitVector() && AllowFloatDomain) {
27974 assert(Subtarget.hasAVX512() &&
27975 "AVX512 required for 512-bit vector shuffles");
27976 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27977 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v8f64;
      return true;
    }
27981 if (isTargetShuffleEquivalent(
27982 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27983 Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
27987 if (isTargetShuffleEquivalent(
27988 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27989 Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
  }
27995 // Attempt to match against broadcast-from-vector.
27996 if (Subtarget.hasAVX2()) {
27997 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27998 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27999 SrcVT = DstVT = MaskVT;
      Shuffle = X86ISD::VBROADCAST;
      return true;
    }
  }

  return false;
}
28008 // Attempt to match a combined shuffle mask against supported unary immediate
28009 // permute instructions.
28010 // TODO: Investigate sharing more of this with shuffle lowering.
28011 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28012 const APInt &Zeroable,
28013 bool AllowFloatDomain,
28014 bool AllowIntDomain,
28015 const X86Subtarget &Subtarget,
28016 unsigned &Shuffle, MVT &ShuffleVT,
28017 unsigned &PermuteImm) {
28018 unsigned NumMaskElts = Mask.size();
28019 unsigned InputSizeInBits = MaskVT.getSizeInBits();
28020 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
28021 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
28023 bool ContainsZeros =
28024 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
28027 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
28028 // Check for lane crossing permutes.
28029 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
28030 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
28031 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
28032 Shuffle = X86ISD::VPERMI;
28033 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
        PermuteImm = getV4X86ShuffleImm(Mask);
        return true;
      }
28037 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
28038 SmallVector<int, 4> RepeatedMask;
28039 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
28040 Shuffle = X86ISD::VPERMI;
28041 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
          return true;
        }
      }
28046 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
28047 // VPERMILPD can permute with a non-repeating shuffle.
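      // Illustrative example: a v4f64 mask {1,0,3,2} sets bit i to
      // (Mask[i] & 1), producing PermuteImm 0b0101 = 0x5.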
28048 Shuffle = X86ISD::VPERMILPI;
28049 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
      PermuteImm = 0;
      for (int i = 0, e = Mask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M == SM_SentinelUndef)
          continue;
        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
        PermuteImm |= (M & 1) << i;
      }
      return true;
    }
  }
28062 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
  // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
28065 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
28066 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
28067 SmallVector<int, 4> RepeatedMask;
28068 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28069 // Narrow the repeated mask to create 32-bit element permutes.
28070 SmallVector<int, 4> WordMask = RepeatedMask;
28071 if (MaskScalarSizeInBits == 64)
28072 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
28074 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
28075 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
28076 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
      PermuteImm = getV4X86ShuffleImm(WordMask);
      return true;
    }
  }
28082 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
28083 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
28084 SmallVector<int, 4> RepeatedMask;
28085 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28086 ArrayRef<int> LoMask(Mask.data() + 0, 4);
28087 ArrayRef<int> HiMask(Mask.data() + 4, 4);
28089 // PSHUFLW: permute lower 4 elements only.
28090 if (isUndefOrInRange(LoMask, 0, 4) &&
28091 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
28092 Shuffle = X86ISD::PSHUFLW;
28093 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }
28098 // PSHUFHW: permute upper 4 elements only.
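      // Illustrative example: HiMask {5,4,7,6} is offset to {1,0,3,2} below,
      // which getV4X86ShuffleImm encodes as 0xB1.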
28099 if (isUndefOrInRange(HiMask, 4, 8) &&
28100 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
28101 // Offset the HiMask so that we can create the shuffle immediate.
28102 int OffsetHiMask[4];
28103 for (int i = 0; i != 4; ++i)
28104 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
28106 Shuffle = X86ISD::PSHUFHW;
28107 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }
  }
28114 // Attempt to match against byte/bit shifts.
28115 // FIXME: Add 512-bit support.
28116 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28117 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28118 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
28119 MaskScalarSizeInBits, Mask,
28120 0, Zeroable, Subtarget);
28121 if (0 < ShiftAmt) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }

  return false;
}
28130 // Attempt to match a combined unary shuffle mask against supported binary
28131 // shuffle instructions.
28132 // TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     bool AllowFloatDomain, bool AllowIntDomain,
                                     SDValue &V1, SDValue &V2, SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
                                     bool IsUnary) {
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28142 if (MaskVT.is128BitVector()) {
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVLHPS;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVHLPS;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      SrcVT = DstVT = MaskVT;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      SrcVT = DstVT = MaskVT;
      return true;
    }
  }
28170 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
  // TODO: Add support for 256/512-bit types.
28172 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
    if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
                                   Subtarget)) {
      DstVT = MaskVT;
      return true;
    }
  }
28180 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
28181 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
28182 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28183 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
28184 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
28185 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
                                    DAG, Subtarget)) {
      SrcVT = DstVT = MaskVT;
      if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
        SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
      return true;
    }
  }

  return false;
}
28198 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28199 const APInt &Zeroable,
28200 bool AllowFloatDomain,
28201 bool AllowIntDomain,
                                            SDValue &V1, SDValue &V2, SDLoc &DL,
                                            SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget,
28205 unsigned &Shuffle, MVT &ShuffleVT,
28206 unsigned &PermuteImm) {
28207 unsigned NumMaskElts = Mask.size();
28208 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28210 // Attempt to match against PALIGNR byte rotate.
28211 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28212 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28213 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
28214 if (0 < ByteRotation) {
28215 Shuffle = X86ISD::PALIGNR;
28216 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }
28222 // Attempt to combine to X86ISD::BLENDI.
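  // Reminder of the immediate semantics (illustrative): blend bit i selects
  // element i from V2 when set and from V1 when clear, so blending v8i16 as
  // {0,9,2,11,4,13,6,15} yields PermuteImm 0b10101010 = 0xAA.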
28223 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
28224 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
28225 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
28226 uint64_t BlendMask = 0;
28227 bool ForceV1Zero = false, ForceV2Zero = false;
28228 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                  BlendMask)) {
28231 if (MaskVT == MVT::v16i16) {
28232 // We can only use v16i16 PBLENDW if the lanes are repeated.
28233 SmallVector<int, 8> RepeatedMask;
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
                                        RepeatedMask)) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          PermuteImm = 0;
          for (int i = 0; i < 8; ++i)
28240 if (RepeatedMask[i] >= 8)
28241 PermuteImm |= 1 << i;
28242 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28243 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28244 Shuffle = X86ISD::BLENDI;
          ShuffleVT = MaskVT;
          return true;
        }
      } else {
28249 // Determine a type compatible with X86ISD::BLENDI.
28250 ShuffleVT = MaskVT;
28251 if (Subtarget.hasAVX2()) {
28252 if (ShuffleVT == MVT::v4i64)
28253 ShuffleVT = MVT::v8i32;
28254 else if (ShuffleVT == MVT::v2i64)
28255 ShuffleVT = MVT::v4i32;
28257 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
28258 ShuffleVT = MVT::v8i16;
28259 else if (ShuffleVT == MVT::v4i64)
28260 ShuffleVT = MVT::v4f64;
28261 else if (ShuffleVT == MVT::v8i32)
            ShuffleVT = MVT::v8f32;
        }

        if (!ShuffleVT.isFloatingPoint()) {
          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
          BlendMask =
              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
        }

        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
        PermuteImm = (unsigned)BlendMask;
        Shuffle = X86ISD::BLENDI;
        return true;
      }
    }
  }
28282 // Attempt to combine to INSERTPS.
28283 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
28284 MaskVT.is128BitVector()) {
28285 if (Zeroable.getBoolValue() &&
28286 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
28287 Shuffle = X86ISD::INSERTPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
  }
28293 // Attempt to combine to SHUFPD.
28294 if (AllowFloatDomain && EltSizeInBits == 64 &&
28295 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28296 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28297 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28298 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
28299 Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
      return true;
    }
  }
28305 // Attempt to combine to SHUFPS.
28306 if (AllowFloatDomain && EltSizeInBits == 32 &&
28307 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
28308 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28309 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28310 SmallVector<int, 4> RepeatedMask;
28311 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask, to determine whether it is just
      // referencing one of the vectors, is zeroable, or is entirely undef.
28314 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
28315 int M0 = RepeatedMask[Offset];
28316 int M1 = RepeatedMask[Offset + 1];
28318 if (isUndefInRange(RepeatedMask, Offset, 2)) {
28319 return DAG.getUNDEF(MaskVT);
28320 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
28321 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
28322 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
28323 return getZeroVector(MaskVT, Subtarget, DAG, DL);
28324 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
28325 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };
28337 int ShufMask[4] = {-1, -1, -1, -1};
28338 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  return false;
}
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
28358 /// This is the leaf of the recursive combine below. When we have found some
28359 /// chain of single-use x86 shuffle instructions and accumulated the combined
28360 /// shuffle mask represented by them, this will try to pattern match that mask
28361 /// into either a single instruction if there is a special purpose instruction
28362 /// for this operation, or into a PSHUFB instruction which is a fully general
28363 /// instruction but should only be used to replace chains over a certain depth.
28364 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
28365 ArrayRef<int> BaseMask, int Depth,
28366 bool HasVariableMask, SelectionDAG &DAG,
28367 TargetLowering::DAGCombinerInfo &DCI,
28368 const X86Subtarget &Subtarget) {
28369 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
28370 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
28371 "Unexpected number of shuffle inputs!");
28373 // Find the inputs that enter the chain. Note that multiple uses are OK
28374 // here, we're not going to remove the operands we find.
28375 bool UnaryShuffle = (Inputs.size() == 1);
28376 SDValue V1 = peekThroughBitcasts(Inputs[0]);
28377 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
28378 : peekThroughBitcasts(Inputs[1]));
28380 MVT VT1 = V1.getSimpleValueType();
28381 MVT VT2 = V2.getSimpleValueType();
28382 MVT RootVT = Root.getSimpleValueType();
28383 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
28384 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
28385 "Vector size mismatch");
28390 unsigned NumBaseMaskElts = BaseMask.size();
28391 if (NumBaseMaskElts == 1) {
28392 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
    return DAG.getBitcast(RootVT, V1);
  }
28396 unsigned RootSizeInBits = RootVT.getSizeInBits();
28397 unsigned NumRootElts = RootVT.getVectorNumElements();
28398 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
28399 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
28400 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
  // Don't combine if we are an AVX512/EVEX target and the mask element size
28403 // is different from the root element size - this would prevent writemasks
28404 // from being reused.
28405 // TODO - this currently prevents all lane shuffles from occurring.
28406 // TODO - check for writemasks usage instead of always preventing combining.
28407 // TODO - attempt to narrow Mask back to writemask size.
28408 bool IsEVEXShuffle =
28409 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
  if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
    return SDValue();
28413 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
28415 // Handle 128-bit lane shuffles of 256-bit vectors.
28416 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
28417 // we need to use the zeroing feature.
28418 // TODO - this should support binary shuffles.
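  // VPERM2X128 immediate reminder (illustrative): bits [1:0] pick the source
  // 128-bit lane for the low half, bits [5:4] for the high half, and the 0x8
  // bit zeroes that half; e.g. BaseMask {1,0} becomes PermMask 0x01.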
28419 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
28420 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
28421 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
28422 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
28423 return SDValue(); // Nothing to do!
28424 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
28425 unsigned PermMask = 0;
28426 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
28427 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
28429 Res = DAG.getBitcast(ShuffleVT, V1);
28430 DCI.AddToWorklist(Res.getNode());
28431 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
28432 DAG.getUNDEF(ShuffleVT),
28433 DAG.getConstant(PermMask, DL, MVT::i8));
28434 DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }
28438 // For masks that have been widened to 128-bit elements or more,
28439 // narrow back down to 64-bit elements.
28440 SmallVector<int, 64> Mask;
28441 if (BaseMaskEltSizeInBits > 64) {
28442 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
28443 int MaskScale = BaseMaskEltSizeInBits / 64;
    scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
  } else {
    Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
  }
28449 unsigned NumMaskElts = Mask.size();
28450 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
28452 // Determine the effective mask value type.
28453 FloatDomain &= (32 <= MaskEltSizeInBits);
28454 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
28455 : MVT::getIntegerVT(MaskEltSizeInBits);
28456 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
28458 // Only allow legal mask types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
    return SDValue();
28462 // Attempt to match the mask against known shuffle patterns.
28463 MVT ShuffleSrcVT, ShuffleVT;
28464 unsigned Shuffle, PermuteImm;
28466 // Which shuffle domains are permitted?
28467 // Permit domain crossing at higher combine depths.
28468 bool AllowFloatDomain = FloatDomain || (Depth > 3);
28469 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
28470 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
28472 // Determine zeroable mask elements.
28473 APInt Zeroable(NumMaskElts, 0);
28474 for (unsigned i = 0; i != NumMaskElts; ++i)
28475 if (isUndefOrZero(Mask[i]))
28476 Zeroable.setBit(i);
28478 if (UnaryShuffle) {
28479 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
28480 // directly if we don't shuffle the lower element and we shuffle the upper
28481 // (zero) elements within themselves.
28482 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
28483 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
28484 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
28485 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
28486 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
28487 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
        return DAG.getBitcast(RootVT, V1);
      }
    }
28492 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
                                V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
                                ShuffleVT)) {
28495 if (Depth == 1 && Root.getOpcode() == Shuffle)
28496 return SDValue(); // Nothing to do!
28497 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28498 return SDValue(); // AVX512 Writemask clash.
28499 Res = DAG.getBitcast(ShuffleSrcVT, V1);
28500 DCI.AddToWorklist(Res.getNode());
28501 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
28502 DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }
28506 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28507 AllowIntDomain, Subtarget, Shuffle,
28508 ShuffleVT, PermuteImm)) {
28509 if (Depth == 1 && Root.getOpcode() == Shuffle)
28510 return SDValue(); // Nothing to do!
28511 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28512 return SDValue(); // AVX512 Writemask clash.
28513 Res = DAG.getBitcast(ShuffleVT, V1);
28514 DCI.AddToWorklist(Res.getNode());
28515 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
28516 DAG.getConstant(PermuteImm, DL, MVT::i8));
28517 DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }
  }
28522 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28523 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
28524 ShuffleVT, UnaryShuffle)) {
28525 if (Depth == 1 && Root.getOpcode() == Shuffle)
28526 return SDValue(); // Nothing to do!
28527 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28528 return SDValue(); // AVX512 Writemask clash.
28529 V1 = DAG.getBitcast(ShuffleSrcVT, V1);
28530 DCI.AddToWorklist(V1.getNode());
28531 V2 = DAG.getBitcast(ShuffleSrcVT, V2);
28532 DCI.AddToWorklist(V2.getNode());
28533 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
28534 DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }
28538 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28539 AllowIntDomain, V1, V2, DL, DAG,
                                      Subtarget, Shuffle, ShuffleVT,
                                      PermuteImm)) {
28542 if (Depth == 1 && Root.getOpcode() == Shuffle)
28543 return SDValue(); // Nothing to do!
28544 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28545 return SDValue(); // AVX512 Writemask clash.
28546 V1 = DAG.getBitcast(ShuffleVT, V1);
28547 DCI.AddToWorklist(V1.getNode());
28548 V2 = DAG.getBitcast(ShuffleVT, V2);
28549 DCI.AddToWorklist(V2.getNode());
28550 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
28551 DAG.getConstant(PermuteImm, DL, MVT::i8));
28552 DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }
28556 // Typically from here on, we need an integer version of MaskVT.
28557 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
28558 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
28560 // Annoyingly, SSE4A instructions don't map into the above match helpers.
28561 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
28562 uint64_t BitLen, BitIdx;
    if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
                                  Zeroable)) {
28565 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
28566 return SDValue(); // Nothing to do!
28567 V1 = DAG.getBitcast(IntMaskVT, V1);
28568 DCI.AddToWorklist(V1.getNode());
28569 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
28570 DAG.getConstant(BitLen, DL, MVT::i8),
28571 DAG.getConstant(BitIdx, DL, MVT::i8));
28572 DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }
28576 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
28577 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
28578 return SDValue(); // Nothing to do!
28579 V1 = DAG.getBitcast(IntMaskVT, V1);
28580 DCI.AddToWorklist(V1.getNode());
28581 V2 = DAG.getBitcast(IntMaskVT, V2);
28582 DCI.AddToWorklist(V2.getNode());
28583 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
28584 DAG.getConstant(BitLen, DL, MVT::i8),
28585 DAG.getConstant(BitIdx, DL, MVT::i8));
28586 DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }
  }
28591 // Don't try to re-form single instruction chains under any circumstances now
  // that we've done encoding canonicalization for them.
  if (Depth < 2)
    return SDValue();
28596 // Depth threshold above which we can efficiently use variable mask shuffles.
28597 // TODO This should probably be target specific.
28598 bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
28600 bool MaskContainsZeros =
28601 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28603 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
28604 // If we have a single input lane-crossing shuffle then lower to VPERMV.
28605 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28606 ((Subtarget.hasAVX2() &&
28607 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28608 (Subtarget.hasAVX512() &&
28609 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28610 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28611 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28612 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28613 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28614 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28615 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28616 DCI.AddToWorklist(VPermMask.getNode());
28617 Res = DAG.getBitcast(MaskVT, V1);
28618 DCI.AddToWorklist(Res.getNode());
28619 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
28620 DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }
28624 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
28625 // vector as the second source.
28626 if (UnaryShuffle && AllowVariableMask &&
28627 ((Subtarget.hasAVX512() &&
28628 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28629 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28630 (Subtarget.hasVLX() &&
28631 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28632 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28633 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28634 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28635 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28636 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28637 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
28638 for (unsigned i = 0; i != NumMaskElts; ++i)
28639 if (Mask[i] == SM_SentinelZero)
28640 Mask[i] = NumMaskElts + i;
28642 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28643 DCI.AddToWorklist(VPermMask.getNode());
28644 Res = DAG.getBitcast(MaskVT, V1);
28645 DCI.AddToWorklist(Res.getNode());
28646 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
28647 DCI.AddToWorklist(Zero.getNode());
28648 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
28649 DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }
28653 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
28654 if (AllowVariableMask && !MaskContainsZeros &&
28655 ((Subtarget.hasAVX512() &&
28656 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28657 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28658 (Subtarget.hasVLX() &&
28659 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28660 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28661 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28662 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28663 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28664 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28665 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28666 DCI.AddToWorklist(VPermMask.getNode());
28667 V1 = DAG.getBitcast(MaskVT, V1);
28668 DCI.AddToWorklist(V1.getNode());
28669 V2 = DAG.getBitcast(MaskVT, V2);
28670 DCI.AddToWorklist(V2.getNode());
28671 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
28672 DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }
  }
28678 // See if we can combine a single input shuffle with zeros to a bit-mask,
28679 // which is much simpler than any shuffle.
28680 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
28681 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
28682 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
28683 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
28684 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
28685 APInt UndefElts(NumMaskElts, 0);
28686 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
    for (unsigned i = 0; i != NumMaskElts; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef) {
        UndefElts.setBit(i);
        continue;
      }
      if (M == SM_SentinelZero)
        continue;
      EltBits[i] = AllOnes;
    }
28697 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
28698 DCI.AddToWorklist(BitMask.getNode());
28699 Res = DAG.getBitcast(MaskVT, V1);
28700 DCI.AddToWorklist(Res.getNode());
28701 unsigned AndOpcode =
28702 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
28703 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
28704 DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }
  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes, lower it to a variable-mask VPERMILPS.
  // TODO: Combine other mask types at higher depths.
28711 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28712 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
28713 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
28714 SmallVector<SDValue, 16> VPermIdx;
    for (int M : Mask) {
      SDValue Idx =
          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
      VPermIdx.push_back(Idx);
    }
28720 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
28721 DCI.AddToWorklist(VPermMask.getNode());
28722 Res = DAG.getBitcast(MaskVT, V1);
28723 DCI.AddToWorklist(Res.getNode());
28724 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
28725 DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }
28729 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
28730 // to VPERMIL2PD/VPERMIL2PS.
28731 if (AllowVariableMask && Subtarget.hasXOP() &&
28732 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
28733 MaskVT == MVT::v8f32)) {
28734 // VPERMIL2 Operation.
28735 // Bits[3] - Match Bit.
28736 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
28737 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
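    // Illustrative v4f32 example: mask element 5 (element 1 of V2) is encoded
    // as selector 5 below, while a zeroable element is encoded as 8 with the
    // match immediate M2ZImm set to request zeroing.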
28738 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
28739 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
28740 SmallVector<int, 8> VPerm2Idx;
28741 unsigned M2ZImm = 0;
28742 for (int M : Mask) {
28743 if (M == SM_SentinelUndef) {
        VPerm2Idx.push_back(-1);
        continue;
      }
      if (M == SM_SentinelZero) {
        M2ZImm = 2;
        VPerm2Idx.push_back(8);
        continue;
      }
      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
      Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
      VPerm2Idx.push_back(Index);
    }
28756 V1 = DAG.getBitcast(MaskVT, V1);
28757 DCI.AddToWorklist(V1.getNode());
28758 V2 = DAG.getBitcast(MaskVT, V2);
28759 DCI.AddToWorklist(V2.getNode());
28760 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
28761 DCI.AddToWorklist(VPerm2MaskOp.getNode());
28762 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
28763 DAG.getConstant(M2ZImm, DL, MVT::i8));
28764 DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }
28768 // If we have 3 or more shuffle instructions or a chain involving a variable
28769 // mask, we can replace them with a single PSHUFB instruction profitably.
  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
  // instructions, but in practice PSHUFB tends to be *very* fast so we're
  // more aggressive.
28773 if (UnaryShuffle && AllowVariableMask &&
28774 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28775 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
28776 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
28777 SmallVector<SDValue, 16> PSHUFBMask;
28778 int NumBytes = RootVT.getSizeInBits() / 8;
28779 int Ratio = NumBytes / NumMaskElts;
28780 for (int i = 0; i < NumBytes; ++i) {
28781 int M = Mask[i / Ratio];
28782 if (M == SM_SentinelUndef) {
        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
28786 if (M == SM_SentinelZero) {
        PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
        continue;
      }
28790 M = Ratio * M + i % Ratio;
28791 assert((M / 16) == (i / 16) && "Lane crossing detected");
      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
28794 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
28795 Res = DAG.getBitcast(ByteVT, V1);
28796 DCI.AddToWorklist(Res.getNode());
28797 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28798 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28799 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28800 DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }
28804 // With XOP, if we have a 128-bit binary input shuffle we can always combine
28805 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
28806 // slower than PSHUFB on targets that support both.
28807 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
28808 // VPPERM Mask Operation
28809 // Bits[4:0] - Byte Index (0 - 31)
28810 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
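    // Illustrative example: selector byte 17 picks byte 1 of V2, while the
    // constant 128 (0x80) encodes the ZERO operation in bits [7:5] and forces
    // the result byte to zero.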
    SmallVector<SDValue, 16> VPPERMMask;
    int NumBytes = 16;
28813 int Ratio = NumBytes / NumMaskElts;
28814 for (int i = 0; i < NumBytes; ++i) {
28815 int M = Mask[i / Ratio];
28816 if (M == SM_SentinelUndef) {
        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
28820 if (M == SM_SentinelZero) {
        VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
        continue;
      }
28824 M = Ratio * M + i % Ratio;
      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
28827 MVT ByteVT = MVT::v16i8;
28828 V1 = DAG.getBitcast(ByteVT, V1);
28829 DCI.AddToWorklist(V1.getNode());
28830 V2 = DAG.getBitcast(ByteVT, V2);
28831 DCI.AddToWorklist(V2.getNode());
28832 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
28833 DCI.AddToWorklist(VPPERMMaskOp.getNode());
28834 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
28835 DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }

  // Failed to find any combines.
  return SDValue();
}
28843 // Attempt to constant fold all of the constant source ops.
28844 // Returns true if the entire shuffle is folded to a constant.
28845 // TODO: Extend this to merge multiple constant Ops and update the mask.
28846 static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
28847 ArrayRef<int> Mask, SDValue Root,
                                           bool HasVariableMask,
                                           SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
28851 const X86Subtarget &Subtarget) {
28852 MVT VT = Root.getSimpleValueType();
28854 unsigned SizeInBits = VT.getSizeInBits();
28855 unsigned NumMaskElts = Mask.size();
28856 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
28857 unsigned NumOps = Ops.size();
28859 // Extract constant bits from each source op.
28860 bool OneUseConstantOp = false;
28861 SmallVector<APInt, 16> UndefEltsOps(NumOps);
28862 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
28863 for (unsigned i = 0; i != NumOps; ++i) {
28864 SDValue SrcOp = Ops[i];
28865 OneUseConstantOp |= SrcOp.hasOneUse();
    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
                                       RawBitsOps[i]))
      return SDValue();
  }
  // Only fold if at least one of the constants is only used once or
  // the combined shuffle has included a variable mask shuffle; this
  // avoids constant pool bloat.
  if (!OneUseConstantOp && !HasVariableMask)
    return SDValue();
28877 // Shuffle the constant bits according to the mask.
28878 APInt UndefElts(NumMaskElts, 0);
28879 APInt ZeroElts(NumMaskElts, 0);
28880 APInt ConstantElts(NumMaskElts, 0);
28881 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
28882 APInt::getNullValue(MaskSizeInBits));
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      UndefElts.setBit(i);
      continue;
    } else if (M == SM_SentinelZero) {
      ZeroElts.setBit(i);
      continue;
    }
28892 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
28894 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
28895 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
28897 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
28898 if (SrcUndefElts[SrcMaskIdx]) {
      UndefElts.setBit(i);
      continue;
    }
28903 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
    APInt &Bits = SrcEltBits[SrcMaskIdx];
    if (!Bits) {
      ZeroElts.setBit(i);
      continue;
    }
28910 ConstantElts.setBit(i);
    ConstantBitData[i] = Bits;
  }
28913 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
  // Create the constant data.
  MVT MaskSVT;
28917 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
28918 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
28920 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

  SDLoc DL(Root);
28925 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
28926 DCI.AddToWorklist(CstOp.getNode());
28927 return DAG.getBitcast(VT, CstOp);
/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
///    equivalent. In most cases, this is just an encoding size win, but
///    sometimes we will collapse multiple generic shuffles into a single
///    special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
///    instructions, and replace them with the slightly more expensive SSSE3
///    PSHUFB instruction if available. We do this as the last combining step
///    to ensure we avoid using PSHUFB if we can implement the shuffle with
///    a suitable short sequence of other instructions. The PSHUFB will either
///    use a register or have to read from memory and so is slightly (but only
///    slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
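///
/// Illustrative example (not in the original comment): composing a root
/// PSHUFD mask <2,3,0,1> over an operand PSHUFD mask <1,0,3,2> reads element
/// i of the result from OpMask[RootMask[i]], giving the combined mask
/// <3,2,1,0>, which the chain combine below can re-emit as a single shuffle.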
static SDValue combineX86ShufflesRecursively(
    ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
    ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
    bool HasVariableMask, SelectionDAG &DAG,
    TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return SDValue();

  // Directly rip through bitcasts to find the underlying operand.
  SDValue Op = SrcOps[SrcOpIndex];
  Op = peekThroughOneUseBitcasts(Op);

  MVT VT = Op.getSimpleValueType();
  if (!VT.isVector())
    return SDValue(); // Bail if we hit a non-vector.

  assert(Root.getSimpleValueType().isVector() &&
         "Shuffles operate on vector types!");
  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
         "Can only combine shuffles of the same vector register size.");

  // Extract target shuffle mask and resolve sentinels and inputs.
  SmallVector<int, 64> OpMask;
  SmallVector<SDValue, 2> OpInputs;
  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
    return SDValue();

  assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
  SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
  SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

  // Add the inputs to the Ops list, avoiding duplicates.
  SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

  int InputIdx0 = -1, InputIdx1 = -1;
  for (int i = 0, e = Ops.size(); i < e; ++i) {
    SDValue BC = peekThroughBitcasts(Ops[i]);
    if (Input0 && BC == peekThroughBitcasts(Input0))
      InputIdx0 = i;
    if (Input1 && BC == peekThroughBitcasts(Input1))
      InputIdx1 = i;
  }

  if (Input0 && InputIdx0 < 0) {
    InputIdx0 = SrcOpIndex;
    Ops[SrcOpIndex] = Input0;
  }
  if (Input1 && InputIdx1 < 0) {
    InputIdx1 = Ops.size();
    Ops.push_back(Input1);
  }

  assert(((RootMask.size() > OpMask.size() &&
           RootMask.size() % OpMask.size() == 0) ||
          (OpMask.size() > RootMask.size() &&
           OpMask.size() % RootMask.size() == 0) ||
          OpMask.size() == RootMask.size()) &&
         "The smaller number of elements must divide the larger.");

  // This function can be performance-critical, so we rely on the power-of-2
  // knowledge that we have about the mask sizes to replace div/rem ops with
  // bit-masks and shifts.
  assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
  unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

  unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
  unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
  unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
  assert((RootRatio == 1 || OpRatio == 1) &&
         "Must not have a ratio for both incoming and op masks!");

  assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
  unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

  SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by the
  // root mask to get us all the way to the root value arrangement. The reason
  // for this order is that we are recursing up the operation chain.
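  // Illustrative example (not in the original comment): with a 4-element root
  // mask over an 8-element op mask, RootRatio is 2, so result element i reads
  // RootMask[i >> 1]; a root mask value of 2 then expands to the scaled
  // indices 4 and 5 ((2 << 1) + (i & 1)) before being mapped through OpMask.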
  for (unsigned i = 0; i < MaskWidth; ++i) {
    unsigned RootIdx = i >> RootRatioLog2;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask[i] = RootMask[RootIdx];
      continue;
    }

    unsigned RootMaskedIdx =
        RootRatio == 1
            ? RootMask[RootIdx]
            : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

    // Just insert the scaled root mask value if it references an input other
    // than the SrcOp we're currently inserting.
    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
      Mask[i] = RootMaskedIdx;
      continue;
    }

    RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
    unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask[i] = OpMask[OpIdx];
      continue;
    }

    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
    unsigned OpMaskedIdx =
        OpRatio == 1
            ? OpMask[OpIdx]
            : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

    OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
    if (OpMask[OpIdx] < (int)OpMask.size()) {
      assert(0 <= InputIdx0 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx0 * MaskWidth;
    } else {
      assert(0 <= InputIdx1 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx1 * MaskWidth;
    }

    Mask[i] = OpMaskedIdx;
  }

  // Handle the all undef/zero cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
    return DAG.getUNDEF(Root.getValueType());

  // TODO - should we handle the mixed zero/undef case as well? Just returning
  // a zero mask will lose information on undef elements possibly reducing
  // future combine possibilities.
  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
    return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
                         SDLoc(Root));

  // Remove unused shuffle source ops.
  resolveTargetShuffleInputsAndMask(Ops, Mask);
  assert(!Ops.empty() && "Shuffle with no inputs detected");

  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

  // Update the list of shuffle nodes that have been combined so far.
  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
                                                SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should only be combined if it either has a
  // single use (i.e. current Op) or all its users have already been combined.
  for (int i = 0, e = Ops.size(); i < e; ++i)
    if (Ops[i].getNode()->hasOneUse() ||
        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
      if (SDValue Res = combineX86ShufflesRecursively(
              Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
              DAG, DCI, Subtarget))
        return Res;

  // Attempt to constant fold all of the constant source ops.
  if (SDValue Cst = combineX86ShufflesConstants(
          Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
    return Cst;

  // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() > 2)
    return SDValue();

  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
  // elements, and shrink them to the half-width mask. It does this in a loop
  // so it will reduce the size of the mask to the minimal width mask which
  // performs an equivalent shuffle.
  SmallVector<int, 64> WidenedMask;
  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
    WidenedMask.clear();
  }

  // Canonicalization of binary shuffle masks to improve pattern matching by
  // commuting the inputs.
  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(Ops[0], Ops[1]);
  }

  // Finally, try to combine into a single shuffle instruction.
  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
                                DCI, Subtarget);
}
/// \brief Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
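///
/// For example (illustrative, not in the original comment): a PSHUFHW with
/// immediate 0xB1 swaps adjacent words in the high half, giving the full mask
/// <0,1,2,3,5,4,7,6>; after dropping the identity low half and rebasing by -4
/// this returns <1,0,3,2>.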
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  SmallVector<SDValue, 2> Ops;
  bool IsUnary;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  if (VT.getSizeInBits() > 128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  default:
    llvm_unreachable("No valid shuffle instruction found!");
  }
}
/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          LLVM_FALLTHROUGH;
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break; // Break out of the loop if we break out of the switch.
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}
/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
/// pair of dwords.
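///
/// For example (illustrative, not in the original comment): in
///   (PSHUFLW M1 (PSHUFHW M2 (PSHUFLW M3 x)))
/// the PSHUFHW only touches the other half, so the outer PSHUFLW merges with
/// the inner one by composing M1 with M3.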
static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
                                        SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  assert(
      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
      "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);
  unsigned CombineOpcode = N.getOpcode();

  // Walk up a single-use chain looking for a combinable shuffle.
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return false; // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      if (V.getOpcode() == CombineOpcode)
        break;

      // Other-half shuffles are no-ops.
      continue;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return false;

  // Combine away the bottom node as its shuffle will be accumulated into
  // a preceding shuffle.
  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Record the old value.
  SDValue Old = V;

  // Merge this node's mask and our incoming mask (adjusted to account for all
  // the pshufd instructions encountered).
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Check that the shuffles didn't cancel each other out. If not, we need to
  // combine to the new one.
  if (Old != V)
    // Replace the combinable shuffle with the combined one, updating all users
    // so that we re-evaluate the chain here.
    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

  return true;
}
/// \brief Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  unsigned Opcode = N.getOpcode();

  // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
  // single instruction.
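  // For example (illustrative, not in the original comment): for v2f64,
  // UNPCKL (FHADD a, b), (FHADD c, d) computes <a0+a1, c0+c1>, which is
  // exactly FHADD a, c, so two horizontal ops plus the shuffle collapse
  // into a single horizontal op.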
  if (VT.getScalarSizeInBits() == 64 &&
      (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
       Opcode == X86ISD::UNPCKL)) {
    auto BC0 = peekThroughBitcasts(N.getOperand(0));
    auto BC1 = peekThroughBitcasts(N.getOperand(1));
    EVT VT0 = BC0.getValueType();
    EVT VT1 = BC1.getValueType();
    unsigned Opcode0 = BC0.getOpcode();
    unsigned Opcode1 = BC1.getOpcode();
    if (Opcode0 == Opcode1 && VT0 == VT1 &&
        (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
         Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
         Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
      SDValue Lo, Hi;
      if (Opcode == X86ISD::MOVSD) {
        Lo = BC1.getOperand(0);
        Hi = BC0.getOperand(1);
      } else {
        Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
        Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
      }
      SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
      DCI.AddToWorklist(Horiz.getNode());
      return DAG.getBitcast(VT, Horiz);
    }
  }

  switch (Opcode) {
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  case X86ISD::UNPCKL: {
    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    // moves upper half elements into the lower half part. For example:
    //
    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
    //     undef:v16i8
    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
    //
    // will be combined to:
    //
    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

    // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
    // happen due to advanced instructions.
    if (!VT.is128BitVector())
      return SDValue();

    auto Op0 = N.getOperand(0);
    auto Op1 = N.getOperand(1);
    if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

      unsigned NumElts = VT.getVectorNumElements();
      SmallVector<int, 8> ExpectedMask(NumElts, -1);
      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
                NumElts / 2);

      auto ShufOp = Op1.getOperand(0);
      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
    }
    return SDValue();
  }
  case X86ISD::BLENDI: {
    SDValue V0 = N->getOperand(0);
    SDValue V1 = N->getOperand(1);
    assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
           "Unexpected input vector types");

    // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
    // operands and changing the mask to 1. This saves us a bunch of
    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
    // x86InstrInfo knows how to commute this back after instruction selection
    // if it would help register allocation.

    // TODO: If optimizing for size or a processor that doesn't suffer from
    // partial register update stalls, this should be transformed into a MOVSD
    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

    if (VT == MVT::v2f64)
      if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
        if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
          SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
          return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
        }

    return SDValue();
  }
  case X86ISD::MOVSD:
  case X86ISD::MOVSS: {
    SDValue V0 = peekThroughBitcasts(N->getOperand(0));
    SDValue V1 = peekThroughBitcasts(N->getOperand(1));
    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
    if (isZero0 && isZero1)
      return SDValue();

    // We often lower to MOVSD/MOVSS from integer as well as native float
    // types; remove unnecessary domain-crossing bitcasts if we can to make it
    // easier to combine shuffles later on. We've already accounted for the
    // domain switching cost when we decided to lower with it.
    bool isFloat = VT.isFloatingPoint();
    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
      V0 = DAG.getBitcast(NewVT, V0);
      V1 = DAG.getBitcast(NewVT, V1);
      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
    }

    return SDValue();
  }
  case X86ISD::INSERTPS: {
    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
    SDValue Op0 = N.getOperand(0);
    SDValue Op1 = N.getOperand(1);
    SDValue Op2 = N.getOperand(2);
    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
    unsigned ZeroMask = InsertPSMask & 0xF;
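    // The INSERTPS immediate encodes bits [7:6] = source lane, bits [5:4] =
    // destination lane and bits [3:0] = zero mask. Worked example
    // (illustrative, not in the original comment): 0x4A copies source lane 1
    // into destination lane 0 and zeroes lanes 1 and 3.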
    // If we zero out all elements from Op0 then we don't need to reference it.
    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // If we zero out the element from Op1 then we don't need to reference it.
    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // Attempt to merge insertps Op1 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask1;
    SmallVector<SDValue, 2> Ops1;
    if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
      int M = TargetMask1[SrcIdx];
      if (isUndefOrZero(M)) {
        // Zero/UNDEF insertion - zero out element and remove dependency.
        InsertPSMask |= (1u << DstIdx);
        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
      }
      // Update insertps mask srcidx and reference the source input directly.
      assert(0 <= M && M < 8 && "Shuffle index out of range");
      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
      Op1 = Ops1[M < 4 ? 0 : 1];
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
    }

    // Attempt to merge insertps Op0 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask0;
    SmallVector<SDValue, 2> Ops0;
    if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
      return SDValue();

    bool Updated = false;
    bool UseInput00 = false;
    bool UseInput01 = false;
    for (int i = 0; i != 4; ++i) {
      int M = TargetMask0[i];
      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
        // No change if element is already zero or the inserted element.
        continue;
      } else if (isUndefOrZero(M)) {
        // If the target mask is undef/zero then we must zero the element.
        InsertPSMask |= (1u << i);
        Updated = true;
        continue;
      }

      // The input vector element must be inline.
      if (M != i && M != (i + 4))
        return SDValue();

      // Determine which inputs of the target shuffle we're using.
      UseInput00 |= (0 <= M && M < 4);
      UseInput01 |= (4 <= M);
    }

    // If we're not using both inputs of the target shuffle then use the
    // referenced input directly.
    if (UseInput00 && !UseInput01) {
      Updated = true;
      Op0 = Ops0[0];
    } else if (!UseInput00 && UseInput01) {
      Updated = true;
      Op0 = Ops0[1];
    }

    if (Updated)
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    return SDValue();
  }
  default:
    return SDValue();
  }

  // Nuke no-op shuffles that show up after combining.
  if (isNoopShuffleMask(Mask))
    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Look for simplifications involving one or two shuffle instructions.
  SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.

    // See if this reduces to a PSHUFD which is no more expensive and can
    // combine with more operations. Note that it has to at least flip the
    // dwords as otherwise it would have been removed as a no-op.
    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
      int DMask[] = {0, 1, 2, 3};
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      V = DAG.getBitcast(DVT, V);
      DCI.AddToWorklist(V.getNode());
      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
      DCI.AddToWorklist(V.getNode());
      return DAG.getBitcast(VT, V);
    }

    // Look for shuffle patterns which can be implemented as a single unpack.
    // FIXME: This doesn't handle the location of the PSHUFD generically, and
    // only works when we have a PSHUFD followed by two half-shuffles.
    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
        (V.getOpcode() == X86ISD::PSHUFLW ||
         V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
          WordMask[i + NOffset] = Mask[i] + NOffset;
          WordMask[i + VOffset] = VMask[i] + VOffset;
        }
        // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
          // We can replace all three shuffles with an unpack.
          V = DAG.getBitcast(VT, D.getOperand(0));
          DCI.AddToWorklist(V.getNode());
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, VT, V, V);
        }
      }
    }
    break;

  case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
      return NewN;

    break;
  }

  return SDValue();
}
/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
/// operation. If true is returned then the operands of the ADDSUB operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
/// so it is easier to generically match. We also insert dummy vector shuffle
/// nodes for the operands which explicitly discard the lanes which are unused
/// by this operation to try to flow the fact that they're unused through the
/// rest of the combiner.
static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
                     SDValue &Opnd0, SDValue &Opnd1) {

  EVT VT = N->getValueType(0);
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
    return false;

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

  // We require the first shuffle operand to be the FSUB node, and the second
  // to be the FADD node.
  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(V1, V2);
  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
    return false;

  // If there are other uses of these operations we can't fold them.
  if (!V1->hasOneUse() || !V2->hasOneUse())
    return false;

  // Ensure that both operations have the same operands. Note that we can
  // commute the FADD operands.
  SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
  if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
    return false;

  // We're looking for blends between FADD and FSUB nodes. We insist on these
  // nodes being lined up in a specific expected pattern.
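  // For example (illustrative, not in the original comment): for v4f32 the
  // mask <0,5,2,7> takes even lanes from the FSUB node and odd lanes from the
  // FADD node, producing <s0-t0, s1+t1, s2-t2, s3+t3>; that is exactly what
  // ADDSUBPS computes.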
  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
                                           8, 25, 10, 27, 12, 29, 14, 31})))
    return false;

  Opnd0 = V1;
  Opnd1 = V2;
  return true;
}

/// \brief Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
                                                const X86Subtarget &Subtarget,
                                                SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
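// For example (illustrative, not in the original comment): for v8i32, a mask
// element of 8 names lane 0 of t2 in (concat_vectors t2, undef); in the new
// (concat_vectors t1, t2) that same lane sits at index 4, hence the
// Elt - NumElts/2 remapping below.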
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
    return SDValue();

  EVT VT = N->getValueType(0);

  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();

  if (VT.getVectorElementType() != MVT::i32 &&
      VT.getVectorElementType() != MVT::i64 &&
      VT.getVectorElementType() != MVT::f32 &&
      VT.getVectorElementType() != MVT::f64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Check that both sources are concats with undef.
  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
      !N1.getOperand(1).isUndef())
    return SDValue();

  // Construct the new shuffle mask. Elements from the first source retain their
  // index, but elements from the second source no longer need to skip an undef.
  SmallVector<int, 8> Mask;
  int NumElts = VT.getVectorNumElements();

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  for (int Elt : SVOp->getMask())
    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));

  SDLoc DL(N);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
                               N1.getOperand(0));
  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N) {
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
    return SDValue();

  SDValue HOp = N->getOperand(0);
  if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
      HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
    return SDValue();

  // 128-bit horizontal math instructions are defined to operate on adjacent
  // lanes of each operand as:
  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
  // ...similarly for v2f64 and v8i16.
  // TODO: 256-bit is not the same because...x86.
  if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
    return SDValue();

  // When the operands of a horizontal math op are identical, the low half of
  // the result is the same as the high half. If the shuffle is also replicating
  // low and high halves, we don't need the shuffle.
  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
  // but this should be tied to whatever horizontal op matching and shuffle
  // canonicalization are producing.
  if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
      isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
      isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
    return HOp;

  return SDValue();
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB node.
  if (TLI.isTypeLegal(VT)) {
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
      return AddSub;

    if (SDValue HAddSub = foldShuffleOfHorizOp(N))
      return HAddSub;
  }

  // During Type Legalization, when promoting illegal vector types,
  // the backend might introduce new shuffle dag nodes and bitcasts.
  //
  // This code performs the following transformation:
  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
  //
  // We do this only if both the bitcast and the BINOP dag nodes have
  // one use. Also, perform this transformation only if the new binary
  // operation is legal. This is to avoid introducing dag nodes that
  // potentially need to be further expanded (or custom lowered) into a
  // less optimal sequence of dag nodes.
  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
      N->getOperand(0).getOpcode() == ISD::BITCAST &&
      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);

    SDValue BC0 = N0.getOperand(0);
    EVT SVT = BC0.getValueType();
    unsigned Opcode = BC0.getOpcode();
    unsigned NumElts = VT.getVectorNumElements();

    if (BC0.hasOneUse() && SVT.isVector() &&
        SVT.getVectorNumElements() * 2 == NumElts &&
        TLI.isOperationLegal(Opcode, VT)) {
      bool CanFold = false;
      switch (Opcode) {
      default : break;
      case ISD::ADD:
      case ISD::SUB:
      case ISD::MUL:
        // isOperationLegal lies for integer ops on floating point types.
        CanFold = VT.isInteger();
        break;
      case ISD::FADD:
      case ISD::FSUB:
      case ISD::FMUL:
        // isOperationLegal lies for floating point ops on integer types.
        CanFold = VT.isFloatingPoint();
        break;
      }

      unsigned SVTNumElts = SVT.getVectorNumElements();
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) < 0;

      if (CanFold) {
        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
      }
    }
  }

  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
      Elts.push_back(Elt);
      continue;
    }
    Elts.clear();
    break;
  }

  if (Elts.size() == VT.getVectorNumElements())
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
      return LD;

  // For AVX2, we sometimes want to combine
  // (vector_shuffle <mask> (concat_vectors t1, undef)
  //                        (concat_vectors t2, undef))
  // Into:
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
    return ShufConcat;

  if (isTargetShuffle(N->getOpcode())) {
    SDValue Op(N, 0);
    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
      return Shuffle;

    // Try recursively combining arbitrary sequences of x86 shuffle
    // instructions into higher-order shuffles. We do this after combining
    // specific PSHUF instruction sequences into their minimal form so that we
    // can evaluate how many specialized shuffle instructions are involved in
    // a particular chain.
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  return SDValue();
}
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
  EVT EltVT = N->getValueType(0);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT OriginalVT = InVec.getValueType();

  // Peek through bitcasts, don't duplicate a load with other uses.
  InVec = peekThroughOneUseBitcasts(InVec);

  EVT CurrentVT = InVec.getValueType();
  if (!CurrentVT.isVector() ||
      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
    return SDValue();

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                            ShuffleOps, ShuffleMask, UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against out of range extract vector.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];

  if (Idx == SM_SentinelZero)
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
  if (Idx == SM_SentinelUndef)
    return DAG.getUNDEF(EltVT);

  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
                                         : ShuffleOps[1];

  // If inputs to shuffle are the same for both ops, then allow 2 uses
  unsigned AllowedUses =
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job
  SDLoc dl(N);

  // Create shuffle node taking into account the case that it's a unary shuffle
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
                                 ShuffleMask);
  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
                                  const X86Subtarget &Subtarget) {
  EVT VT = BitCast.getValueType();
  SDValue N0 = BitCast.getOperand(0);
  EVT VecVT = N0->getValueType(0);

  if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
      N0->getOpcode() == ISD::OR) {
    SDValue Op0 = N0->getOperand(0);
    SDValue Op1 = N0->getOperand(1);
    MVT TrunckVT;
    MVT BitcastVT;
    switch (VT.getSimpleVT().SimpleTy) {
    default:
      return SDValue();
    case MVT::v16i1:
      TrunckVT = MVT::i8;
      BitcastVT = MVT::v8i1;
      break;
    case MVT::v32i1:
      TrunckVT = MVT::i16;
      BitcastVT = MVT::v16i1;
      break;
    case MVT::v64i1:
      TrunckVT = MVT::i32;
      BitcastVT = MVT::v32i1;
      break;
    }
    bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
    bool isArg0UndefLeft =
        Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
    bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
    bool isArg1UndefLeft =
        Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
    SDValue OpLeft;
    SDValue OpRight;
    if (isArg0UndefRight && isArg1UndefLeft) {
      OpLeft = Op0;
      OpRight = Op1;
    } else if (isArg1UndefRight && isArg0UndefLeft) {
      OpLeft = Op1;
      OpRight = Op0;
    } else
      return SDValue();
    SDLoc DL(BitCast);
    SDValue Shr = OpLeft->getOperand(0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
    SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
    SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
    SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
  }

  if (!VT.isScalarInteger() || !VecVT.isSimple())
    return SDValue();

  // With AVX512 vxi1 types are legal and we prefer using k-regs.
  // MOVMSK is supported in SSE2 or later.
  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
    return SDValue();

  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
  // v8i16 and v16i16.
  // For these two cases, we can shuffle the upper element bytes to a
  // consecutive sequence at the start of the vector and treat the results as
  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
  // for v16i16 this is not the case, because the shuffle is expensive, so we
  // avoid sign-extending to this type entirely.
  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
  MVT SExtVT;
  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
  switch (VecVT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::v2i1:
    SExtVT = MVT::v2i64;
    FPCastVT = MVT::v2f64;
    break;
  case MVT::v4i1:
    SExtVT = MVT::v4i32;
    FPCastVT = MVT::v4f32;
    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
    // sign-extend to a 256-bit operation to avoid truncation.
    if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
        N0->getOperand(0)->getValueType(0).is256BitVector()) {
      SExtVT = MVT::v4i64;
      FPCastVT = MVT::v4f64;
    }
    break;
  case MVT::v8i1:
    SExtVT = MVT::v8i16;
    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
    // sign-extend to a 256-bit operation to match the compare.
    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
    // 256-bit because the shuffle is cheaper than sign extending the result
    // of the compare.
    if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
        (N0->getOperand(0)->getValueType(0).is256BitVector() ||
         N0->getOperand(0)->getValueType(0).is512BitVector())) {
      SExtVT = MVT::v8i32;
      FPCastVT = MVT::v8f32;
    }
    break;
  case MVT::v16i1:
    SExtVT = MVT::v16i8;
    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
    // it is not profitable to sign-extend to 256-bit because this will
    // require an extra cross-lane shuffle which is more expensive than
    // truncating the result of the compare to 128-bits.
    break;
  case MVT::v32i1:
    SExtVT = MVT::v32i8;
    break;
  }

  SDLoc DL(BitCast);
  SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);

  if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
    // Handle pre-AVX2 cases by splitting to two v16i1's.
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
    SDValue Lo = extract128BitVector(V, 0, DAG, DL);
    SDValue Hi = extract128BitVector(V, 16, DAG, DL);
    Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
    Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
    Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
                     DAG.getConstant(16, DL, ShiftTy));
    V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
    return DAG.getZExtOrTrunc(V, DL, VT);
  }

  if (SExtVT == MVT::v8i16) {
    assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
    V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
                    DAG.getUNDEF(MVT::v8i16));
  } else
    assert(SExtVT.getScalarType() != MVT::i16 &&
           "Vectors of i16 must be packed");
  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
    V = DAG.getBitcast(FPCastVT, V);
  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
  return DAG.getZExtOrTrunc(V, DL, VT);
}
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();

  // Try to match patterns such as
  // (i16 bitcast (v16i1 x))
  // ->
  // (i16 movmsk (v16i8 sext (v16i1 x)))
  // before the setcc result is scalarized on subtargets that don't have legal
  // vxi1 types.
  if (DCI.isBeforeLegalize())
    if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
      return V;

  // Since MMX types are special and don't usually play with other vector types,
  // it's better to handle them early to be sure we emit efficient code by
  // avoiding store-load conversions.

  // Detect bitcasts from i32 to x86mmx low word.
  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
      SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    if (N00.getValueType() == MVT::i32)
      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
  }

  // Detect bitcasts from element or subvector extraction to x86mmx.
  if (VT == MVT::x86mmx &&
      (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
       N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
      isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    if (N00.getValueType().is128BitVector())
      return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
                         DAG.getBitcast(MVT::v2i64, N00));
  }

  // Detect bitcasts from FP_TO_SINT to x86mmx.
  if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
      N0.getOpcode() == ISD::FP_TO_SINT) {
    SDLoc DL(N0);
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                              DAG.getUNDEF(MVT::v2i32));
    return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
                       DAG.getBitcast(MVT::v2i64, Res));
  }

  // Convert a bitcasted integer logic operation that has one bitcasted
  // floating-point operand into a floating-point logic operation. This may
  // create a load of a constant, but that is cheaper than materializing the
  // constant in an integer register and transferring it to an SSE register or
  // transferring the SSE operand to integer register and back.
  unsigned FPOpcode;
  switch (N0.getOpcode()) {
    case ISD::AND: FPOpcode = X86ISD::FAND; break;
    case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
    case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
    default: return SDValue();
  }

  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
        (Subtarget.hasSSE2() && VT == MVT::f64)))
    return SDValue();

  SDValue LogicOp0 = N0.getOperand(0);
  SDValue LogicOp1 = N0.getOperand(1);
  SDLoc DL0(N0);

  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
  }
  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
  }

  return SDValue();
}
// Match a binop + shuffle pyramid that represents a horizontal reduction over
// the elements of a vector.
// Returns the vector that is being reduced on, or SDValue() if a reduction
// was not matched.
static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
                                   ArrayRef<ISD::NodeType> CandidateBinOps) {
  // The pattern must end in an extract from index 0.
  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
      !isNullConstant(Extract->getOperand(1)))
    return SDValue();

  SDValue Op = Extract->getOperand(0);
  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());

  // Match against one of the candidate binary ops.
  if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
        return Op.getOpcode() == unsigned(BinOp);
      }))
    return SDValue();

  // At each stage, we're looking for something that looks like:
  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
  //                               i32 undef, i32 undef, i32 undef, i32 undef>
  // %a = binop <8 x i32> %op, %s
  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
  // we expect something like:
  // <4,5,6,7,u,u,u,u>
  // <2,3,u,u,u,u,u,u>
  // <1,u,u,u,u,u,u,u>
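  // For example (illustrative, not in the original comment): a v4i32 add
  // reduction is a 2-stage pyramid:
  //   %s1 = shuffle %v, <2,3,u,u>; %a1 = add %v, %s1
  //   %s2 = shuffle %a1, <1,u,u,u>; %a2 = add %a1, %s2
  //   %r = extractelement %a2, 0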
  unsigned CandidateBinOp = Op.getOpcode();
  for (unsigned i = 0; i < Stages; ++i) {
    if (Op.getOpcode() != CandidateBinOp)
      return SDValue();

    ShuffleVectorSDNode *Shuffle =
        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
    if (Shuffle) {
      Op = Op.getOperand(1);
    } else {
      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
      Op = Op.getOperand(0);
    }

    // The first operand of the shuffle should be the same as the other operand
    // of the binop.
    if (!Shuffle || Shuffle->getOperand(0) != Op)
      return SDValue();

    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
        return SDValue();
  }

  BinOp = CandidateBinOp;
  return Op;
}
// Given a select, detect the following pattern:
// 1:    %2 = zext <N x i8> %0 to <N x i32>
// 2:    %3 = zext <N x i8> %1 to <N x i32>
// 3:    %4 = sub nsw <N x i32> %2, %3
// 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
// 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
// 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
                              SDValue &Op1) {
  // Check that the condition of the select instruction is greater-than.
  SDValue SetCC = Select->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC)
    return false;
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  if (CC != ISD::SETGT && CC != ISD::SETLT)
    return false;

  SDValue SelectOp1 = Select->getOperand(1);
  SDValue SelectOp2 = Select->getOperand(2);

  // The following instructions assume SelectOp1 is the subtraction operand
  // and SelectOp2 is the negation operand.
  // In the case of SETLT this is the other way around.
  if (CC == ISD::SETLT)
    std::swap(SelectOp1, SelectOp2);

  // The second operand of the select should be the negation of the first
  // operand, which is implemented as 0 - SelectOp1.
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
        SelectOp2.getOperand(1) == SelectOp1))
    return false;

  // The first operand of SetCC is the first operand of the select, which is the
  // difference between the two input vectors.
  if (SetCC.getOperand(0) != SelectOp1)
    return false;

  // In the SETLT case, the second operand of the comparison can be either 1 or 0.
  APInt SplatVal;
  if ((CC == ISD::SETLT) &&
      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
         SplatVal.isOneValue()) ||
        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
    return false;

  // In the SETGT case, the second operand of the comparison can be either -1 or 0.
  if ((CC == ISD::SETGT) &&
      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
    return false;

  // The first operand of the select is the difference between the two input
  // vectors.
  if (SelectOp1.getOpcode() != ISD::SUB)
    return false;

  Op0 = SelectOp1.getOperand(0);
  Op1 = SelectOp1.getOperand(1);

  // Check if the operands of the sub are zero-extended from vectors of i8.
  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
      Op1.getOpcode() != ISD::ZERO_EXTEND ||
      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
    return false;

  return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
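// PSADBW sums the absolute byte differences of its operands within each
// 8-byte block and produces one i64 result per block; for example
// (illustrative, not in the original comment), v16i8 inputs yield a v2i64
// where element 0 covers bytes 0-7 and element 1 covers bytes 8-15.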
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL) {

  // Find the appropriate width for the PSADBW.
  EVT InVT = Zext0.getOperand(0).getValueType();
  unsigned RegSize = std::max(128u, InVT.getSizeInBits());

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0.
  unsigned NumConcat = RegSize / InVT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
  Ops[0] = Zext0.getOperand(0);
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = Zext1.getOperand(0);
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the SAD.
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
// Attempt to replace a min/max v8i16 horizontal reduction with PHMINPOSUW.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
  // Bail without SSE41.
  if (!Subtarget.hasSSE41())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  if (ExtractVT != MVT::i16)
    return SDValue();

  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
  unsigned BinOp;
  SDValue Src = matchBinOpReduction(
      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
  if (!Src)
    return SDValue();

  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getScalarType();
  if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
    return SDValue();

  SDLoc DL(Extract);
  SDValue MinPos = Src;

  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
  while (SrcVT.getSizeInBits() > 128) {
    unsigned NumElts = SrcVT.getVectorNumElements();
    unsigned NumSubElts = NumElts / 2;
    SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
    unsigned SubSizeInBits = SrcVT.getSizeInBits();
    SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
    SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
  }
  assert(SrcVT == MVT::v8i16 && "Unexpected value type");

  // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
  // to flip the value accordingly.
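  // For example (illustrative, not in the original comment): for SMAX,
  // XOR with 0x7FFF maps i16 values so that larger signed inputs become
  // smaller unsigned ones (0x7FFF -> 0, 0x8000 -> 0xFFFF); PHMINPOSUW then
  // finds the unsigned minimum, and the second XOR undoes the mapping.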
  SDValue Mask;
  if (BinOp == ISD::SMAX)
    Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
  else if (BinOp == ISD::SMIN)
    Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
  else if (BinOp == ISD::UMAX)
    Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);

  if (Mask)
    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);

  if (Mask)
    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
                     DAG.getIntPtrConstant(0, DL));
}
30547 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
30548 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
30550 const X86Subtarget &Subtarget) {
30551 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
30552 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
30555 EVT ExtractVT = Extract->getValueType(0);
30556 unsigned BitWidth = ExtractVT.getSizeInBits();
30557 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
30558 ExtractVT != MVT::i8)
30561 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
30562 unsigned BinOp = 0;
30563 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
30567 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
30568 // which we can't support here for now.
30569 if (Match.getScalarValueSizeInBits() != BitWidth)
30572 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
30573 unsigned MatchSizeInBits = Match.getValueSizeInBits();
30574 if (!(MatchSizeInBits == 128 ||
30575 (MatchSizeInBits == 256 &&
30576 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
30579 // Don't bother performing this for 2-element vectors.
30580 if (Match.getValueType().getVectorNumElements() <= 2)
30583 // Check that we are extracting a reduction of all sign bits.
30584 if (DAG.ComputeNumSignBits(Match) != BitWidth)
30587 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
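// For example (illustrative): for a v4i32 comparison result, MOVMSKPS packs
// the four sign bits into the low bits of a GPR, so all_of becomes
// (MOVMSK == 0xF) and any_of becomes (MOVMSK != 0).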
30589 if (64 == BitWidth || 32 == BitWidth)
30590 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
30591 MatchSizeInBits / BitWidth);
30593 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
30596 ISD::CondCode CondCode;
30597 if (BinOp == ISD::OR) {
30598 // any_of -> MOVMSK != 0
30599 CompareBits = APInt::getNullValue(32);
30600 CondCode = ISD::CondCode::SETNE;
30602 // all_of -> MOVMSK == ((1 << NumElts) - 1)
30603 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
30604 CondCode = ISD::CondCode::SETEQ;
30607 // Perform the select as i32/i64 and then truncate to avoid partial register stalls.
30609 unsigned ResWidth = std::max(BitWidth, 32u);
30610 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
30612 SDValue Zero = DAG.getConstant(0, DL, ResVT);
30613 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
30614 SDValue Res = DAG.getBitcast(MaskVT, Match);
30615 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
30616 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
30617 Ones, Zero, CondCode);
30618 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
30621 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
30622 const X86Subtarget &Subtarget) {
30623 // PSADBW is only supported on SSE2 and up.
30624 if (!Subtarget.hasSSE2())
30627 // Verify that the type we're extracting from is an integer type wider than i16.
30628 EVT VT = Extract->getOperand(0).getValueType();
30629 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
30632 unsigned RegSize = 128;
30633 if (Subtarget.hasBWI())
30635 else if (Subtarget.hasAVX2())
30638 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
30639 // TODO: We should be able to handle larger vectors by splitting them before
30640 // feeding them into several SADs, and then reducing over those.
30641 if (RegSize / VT.getVectorNumElements() < 8)
30644 // Match shuffle + add pyramid.
30645 unsigned BinOp = 0;
30646 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
30648 // The operand is expected to be zero extended from i8
30649 // (verified in detectZextAbsDiff).
30650 // In order to convert to i64 and above, an additional any/zero/sign
30651 // extend is expected.
30652 // The zero extend from 32 bits has no mathematical effect on the result.
30653 // The sign extend also behaves like a zero extend here,
30654 // since the sign bit being extended is known to be zero.
30655 // So it is correct to skip the sign/zero extend instruction.
30656 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
30657 Root.getOpcode() == ISD::ZERO_EXTEND ||
30658 Root.getOpcode() == ISD::ANY_EXTEND))
30659 Root = Root.getOperand(0);
30661 // If there was a match, we want Root to be a select that is the root of an
30662 // abs-diff pattern.
30663 if (!Root || (Root.getOpcode() != ISD::VSELECT))
30666 // Check whether we have an abs-diff pattern feeding into the select.
30667 SDValue Zext0, Zext1;
30668 if (!detectZextAbsDiff(Root, Zext0, Zext1))
30671 // Create the SAD instruction.
30673 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
30675 // If the original vector was wider than 8 elements, sum over the results
30676 // in the SAD vector.
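// For illustration (hypothetical shapes): a v32i8 source on AVX2 gives a
// v4i64 PSADBW result <s0,s1,s2,s3>; the loop below first adds the high half
// onto the low half (giving <s0+s2,s1+s3,..>), then lane 1 onto lane 0,
// leaving the full sum in lane 0.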
30677 unsigned Stages = Log2_32(VT.getVectorNumElements());
30678 MVT SadVT = SAD.getSimpleValueType();
30680 unsigned SadElems = SadVT.getVectorNumElements();
30682 for (unsigned i = Stages - 3; i > 0; --i) {
30683 SmallVector<int, 16> Mask(SadElems, -1);
30684 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
30685 Mask[j] = MaskEnd + j;
30688 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
30689 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
30693 MVT Type = Extract->getSimpleValueType(0);
30694 unsigned TypeSizeInBits = Type.getSizeInBits();
30695 // Return the lowest TypeSizeInBits bits.
30696 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
30697 SAD = DAG.getBitcast(ResVT, SAD);
30698 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
30699 Extract->getOperand(1));
30702 // Attempt to peek through a target shuffle and extract the scalar from the source vector.
30704 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
30705 TargetLowering::DAGCombinerInfo &DCI,
30706 const X86Subtarget &Subtarget) {
30707 if (DCI.isBeforeLegalizeOps())
30710 SDValue Src = N->getOperand(0);
30711 SDValue Idx = N->getOperand(1);
30713 EVT VT = N->getValueType(0);
30714 EVT SrcVT = Src.getValueType();
30715 EVT SrcSVT = SrcVT.getVectorElementType();
30716 unsigned NumSrcElts = SrcVT.getVectorNumElements();
30718 // Don't attempt this for boolean mask vectors or unknown extraction indices.
30719 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
30722 // Resolve the target shuffle inputs and mask.
30723 SmallVector<int, 16> Mask;
30724 SmallVector<SDValue, 2> Ops;
30725 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
30728 // Attempt to narrow/widen the shuffle mask to the correct size.
30729 if (Mask.size() != NumSrcElts) {
30730 if ((NumSrcElts % Mask.size()) == 0) {
30731 SmallVector<int, 16> ScaledMask;
30732 int Scale = NumSrcElts / Mask.size();
30733 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
30734 Mask = std::move(ScaledMask);
30735 } else if ((Mask.size() % NumSrcElts) == 0) {
30736 SmallVector<int, 16> WidenedMask;
30737 while (Mask.size() > NumSrcElts &&
30738 canWidenShuffleElements(Mask, WidenedMask))
30739 Mask = std::move(WidenedMask);
30740 // TODO - investigate support for wider shuffle masks with known upper
30741 // undef/zero elements for implicit zero-extension.
30745 // Check if narrowing/widening failed.
30746 if (Mask.size() != NumSrcElts)
30749 int SrcIdx = Mask[N->getConstantOperandVal(1)];
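// For illustration (hypothetical types): extracting i16 element 5 through a
// v4i32 shuffle with mask <2,3,0,1> scales the mask to <4,5,6,7,0,1,2,3>, so
// SrcIdx == 1 and the extract can instead read i16 element 1 of the shuffle
// input.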
30752 // If the shuffle source element is undef/zero then we can just accept it.
30753 if (SrcIdx == SM_SentinelUndef)
30754 return DAG.getUNDEF(VT);
30756 if (SrcIdx == SM_SentinelZero)
30757 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
30758 : DAG.getConstant(0, dl, VT);
30760 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
30761 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
30762 SrcIdx = SrcIdx % Mask.size();
30764 // We can only extract other elements from 128-bit vectors and in certain
30765 // circumstances, depending on SSE-level.
30766 // TODO: Investigate using extract_subvector for larger vectors.
30767 // TODO: Investigate float/double extraction if it will be just stored.
30768 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
30769 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
30770 assert(SrcSVT == VT && "Unexpected extraction type");
30771 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
30772 DAG.getIntPtrConstant(SrcIdx, dl));
30775 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
30776 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
30777 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
30778 "Unexpected extraction type");
30779 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
30780 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
30781 DAG.getIntPtrConstant(SrcIdx, dl));
30782 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
30788 /// Detect vector gather/scatter index generation and convert it from being a
30789 /// bunch of shuffles and extracts into a somewhat faster sequence.
30790 /// For i686, the best sequence is apparently storing the value and loading
30791 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
30792 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
30793 TargetLowering::DAGCombinerInfo &DCI,
30794 const X86Subtarget &Subtarget) {
30795 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
30798 // TODO - Remove this once we can handle the implicit zero-extension of
30799 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
30800 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
30801 // combineBasicSADPattern.
30802 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
30805 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
30808 SDValue InputVector = N->getOperand(0);
30809 SDValue EltIdx = N->getOperand(1);
30811 EVT SrcVT = InputVector.getValueType();
30812 EVT VT = N->getValueType(0);
30813 SDLoc dl(InputVector);
30815 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
30816 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
30817 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
30818 SDValue MMXSrc = InputVector.getOperand(0);
30820 // The bitcast source is a direct mmx result.
30821 if (MMXSrc.getValueType() == MVT::x86mmx)
30822 return DAG.getBitcast(VT, InputVector);
30825 // Detect mmx to i32 conversion through a v2i32 elt extract.
30826 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
30827 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
30828 SDValue MMXSrc = InputVector.getOperand(0);
30830 // The bitcast source is a direct mmx result.
30831 if (MMXSrc.getValueType() == MVT::x86mmx)
30832 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
30835 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
30836 isa<ConstantSDNode>(EltIdx) &&
30837 isa<ConstantSDNode>(InputVector.getOperand(0))) {
30838 uint64_t ExtractedElt = N->getConstantOperandVal(1);
30839 uint64_t InputValue = InputVector.getConstantOperandVal(0);
30840 uint64_t Res = (InputValue >> ExtractedElt) & 1;
30841 return DAG.getConstant(Res, dl, MVT::i1);
30844 // Check whether this extract is the root of a sum of absolute differences
30845 // pattern. This has to be done here because we really want it to happen
30846 // pre-legalization.
30847 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
30850 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
30851 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
30854 // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
30855 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
30858 // Only operate on vectors of 4 elements, where the alternative shuffling
30859 // gets to be more expensive.
30860 if (SrcVT != MVT::v4i32)
30863 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
30864 // single use which is a sign-extend or zero-extend, and all elements are
30866 SmallVector<SDNode *, 4> Uses;
30867 unsigned ExtractedElements = 0;
30868 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
30869 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
30870 if (UI.getUse().getResNo() != InputVector.getResNo())
30873 SDNode *Extract = *UI;
30874 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
30877 if (Extract->getValueType(0) != MVT::i32)
30879 if (!Extract->hasOneUse())
30881 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
30882 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
30884 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
30887 // Record which element was extracted.
30888 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
30889 Uses.push_back(Extract);
30892 // If not all the elements were used, this may not be worthwhile.
30893 if (ExtractedElements != 15)
30896 // Ok, we've now decided to do the transformation.
30897 // If 64-bit shifts are legal, use the extract-shift sequence,
30898 // otherwise bounce the vector off the cache.
30899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30902 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
30903 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
30904 auto &DL = DAG.getDataLayout();
30905 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
30906 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
30907 DAG.getConstant(0, dl, VecIdxTy));
30908 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
30909 DAG.getConstant(1, dl, VecIdxTy));
30911 SDValue ShAmt = DAG.getConstant(
30912 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
30913 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
30914 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
30915 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
30916 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
30917 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
30918 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
30920 // Store the value to a temporary stack slot.
30921 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
30922 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
30923 MachinePointerInfo());
30925 EVT ElementType = SrcVT.getVectorElementType();
30926 unsigned EltSize = ElementType.getSizeInBits() / 8;
30928 // Replace each use (extract) with a load of the appropriate element.
30929 for (unsigned i = 0; i < 4; ++i) {
30930 uint64_t Offset = EltSize * i;
30931 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
30932 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
30934 SDValue ScalarAddr =
30935 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
30937 // Load the scalar.
30939 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
30943 // Replace the extracts.
30944 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
30945 UE = Uses.end(); UI != UE; ++UI) {
30946 SDNode *Extract = *UI;
30948 uint64_t IdxVal = Extract->getConstantOperandVal(1);
30949 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
30952 // The replacement was made in place; don't return anything.
30956 /// If a vector select has an operand that is -1 or 0, try to simplify the
30957 /// select to a bitwise logic operation.
30958 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
30960 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
30961 TargetLowering::DAGCombinerInfo &DCI,
30962 const X86Subtarget &Subtarget) {
30963 SDValue Cond = N->getOperand(0);
30964 SDValue LHS = N->getOperand(1);
30965 SDValue RHS = N->getOperand(2);
30966 EVT VT = LHS.getValueType();
30967 EVT CondVT = Cond.getValueType();
30969 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30971 if (N->getOpcode() != ISD::VSELECT)
30974 assert(CondVT.isVector() && "Vector select expects a vector selector!");
30976 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30977 // Check if the first operand is all zeros and Cond type is vXi1.
30978 // This situation only applies to avx512.
30979 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
30980 CondVT.getVectorElementType() == MVT::i1) {
30981 // Invert the cond to not(cond) : xor(op,allones)=not(op)
30982 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
30983 DAG.getAllOnesConstant(DL, CondVT));
30984 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
30985 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
30988 // To use the condition operand as a bitwise mask, it must have elements that
30989 // are the same size as the select elements. I.e., the condition operand must
30990 // have already been promoted from the IR select condition type <N x i1>.
30991 // Don't check if the types themselves are equal because that excludes
30992 // vector floating-point selects.
30993 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
30996 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
30997 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
30999 // Try to invert the condition if the true value is not all 1s and the false value is not all 0s.
31001 if (!TValIsAllOnes && !FValIsAllZeros &&
31002 // Check if the selector will be produced by CMPP*/PCMP*.
31003 Cond.getOpcode() == ISD::SETCC &&
31004 // Check if SETCC has already been promoted.
31005 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
31007 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
31009 if (TValIsAllZeros || FValIsAllOnes) {
31010 SDValue CC = Cond.getOperand(2);
31011 ISD::CondCode NewCC =
31012 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
31013 Cond.getOperand(0).getValueType().isInteger());
31014 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
31016 std::swap(LHS, RHS);
31017 TValIsAllOnes = FValIsAllOnes;
31018 FValIsAllZeros = TValIsAllZeros;
31022 // Cond value must be 'sign splat' to be converted to a logical op.
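// For example (illustrative): if every Cond element is known to be all-ones
// or all-zeros (every bit a copy of the sign bit), then (and Cond, X) and
// (or Cond, X) behave as a per-element select; a partial mask such as 0x00FF
// would blend individual bits instead, so it is rejected below.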
31023 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
31026 // vselect Cond, 111..., 000... -> Cond
31027 if (TValIsAllOnes && FValIsAllZeros)
31028 return DAG.getBitcast(VT, Cond);
31030 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
31033 // vselect Cond, 111..., X -> or Cond, X
31034 if (TValIsAllOnes) {
31035 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
31036 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
31037 return DAG.getBitcast(VT, Or);
31040 // vselect Cond, X, 000... -> and Cond, X
31041 if (FValIsAllZeros) {
31042 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
31043 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
31044 return DAG.getBitcast(VT, And);
31047 // vselect Cond, 000..., X -> andn Cond, X
31048 if (TValIsAllZeros) {
31049 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
31050 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
31051 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
31052 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
31053 return DAG.getBitcast(VT, AndN);
31059 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
31060 SDValue Cond = N->getOperand(0);
31061 SDValue LHS = N->getOperand(1);
31062 SDValue RHS = N->getOperand(2);
31065 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
31066 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
31067 if (!TrueC || !FalseC)
31070 // Don't do this for crazy integer types.
31071 EVT VT = N->getValueType(0);
31072 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31075 // We're going to use the condition bit in math or logic ops. We could allow
31076 // this with a wider condition value (post-legalization it becomes an i8),
31077 // but if nothing is creating selects that late, it doesn't matter.
31078 if (Cond.getValueType() != MVT::i1)
31081 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
31082 // 3, 5, or 9 with i32/i64, so those get transformed too.
31083 // TODO: For constants that overflow or do not differ by power-of-2 or small
31084 // multiplier, convert to 'and' + 'add'.
31085 const APInt &TrueVal = TrueC->getAPIntValue();
31086 const APInt &FalseVal = FalseC->getAPIntValue();
31088 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
31092 APInt AbsDiff = Diff.abs();
31093 if (AbsDiff.isPowerOf2() ||
31094 ((VT == MVT::i32 || VT == MVT::i64) &&
31095 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
31097 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
31098 // of the condition can usually be folded into a compare predicate, but even
31099 // without that, the sequence should be cheaper than a CMOV alternative.
31100 if (TrueVal.slt(FalseVal)) {
31101 Cond = DAG.getNOT(DL, Cond, MVT::i1);
31102 std::swap(TrueC, FalseC);
31105 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
31106 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
31108 // Multiply condition by the difference if non-one.
31109 if (!AbsDiff.isOneValue())
31110 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
31112 // Add the base if non-zero.
31113 if (!FalseC->isNullValue())
31114 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
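// Worked example (hypothetical constants): select i1 %c, i32 37, i32 32 has
// AbsDiff == 5, so it becomes zext(%c) * 5 + 32, which a single LEA
// (cond + cond*4 + 32) can compute.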
31122 // If this is a bitcasted op that can be represented as another type, push
31123 // the bitcast to the inputs. This allows more opportunities for pattern
31124 // matching masked instructions. This is called when we know that the operation
31125 // is used as one of the inputs of a vselect.
31126 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
31127 TargetLowering::DAGCombinerInfo &DCI) {
31128 // Make sure we have a bitcast.
31129 if (OrigOp.getOpcode() != ISD::BITCAST)
31132 SDValue Op = OrigOp.getOperand(0);
31134 // If the operation is used by anything other than the bitcast, we shouldn't
31135 // do this combine as that would replicate the operation.
31136 if (!Op.hasOneUse())
31139 MVT VT = OrigOp.getSimpleValueType();
31140 MVT EltVT = VT.getVectorElementType();
31141 SDLoc DL(Op.getNode());
31143 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
31145 Op0 = DAG.getBitcast(VT, Op0);
31146 DCI.AddToWorklist(Op0.getNode());
31147 Op1 = DAG.getBitcast(VT, Op1);
31148 DCI.AddToWorklist(Op1.getNode());
31149 DCI.CombineTo(OrigOp.getNode(),
31150 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
31154 unsigned Opcode = Op.getOpcode();
31156 case X86ISD::SHUF128: {
31157 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
31159 // Only change element size, not type.
31160 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31162 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
31165 case X86ISD::SUBV_BROADCAST: {
31166 unsigned EltSize = EltVT.getSizeInBits();
31167 if (EltSize != 32 && EltSize != 64)
31169 // Only change element size, not type.
31170 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31172 SDValue Op0 = Op.getOperand(0);
31173 MVT Op0VT = MVT::getVectorVT(EltVT,
31174 Op0.getSimpleValueType().getSizeInBits() / EltSize);
31175 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
31176 DCI.AddToWorklist(Op0.getNode());
31177 DCI.CombineTo(OrigOp.getNode(),
31178 DAG.getNode(Opcode, DL, VT, Op0));
31186 /// Do target-specific dag combines on SELECT and VSELECT nodes.
31187 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
31188 TargetLowering::DAGCombinerInfo &DCI,
31189 const X86Subtarget &Subtarget) {
31191 SDValue Cond = N->getOperand(0);
31192 // Get the LHS/RHS of the select.
31193 SDValue LHS = N->getOperand(1);
31194 SDValue RHS = N->getOperand(2);
31195 EVT VT = LHS.getValueType();
31196 EVT CondVT = Cond.getValueType();
31197 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31199 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
31200 // instructions match the semantics of the common C idiom x<y?x:y but not
31201 // x<=y?x:y, because of how they handle negative zero (which can be
31202 // ignored in unsafe-math mode).
31203 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
31204 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
31205 VT != MVT::f80 && VT != MVT::f128 &&
31206 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
31207 (Subtarget.hasSSE2() ||
31208 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
31209 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31211 unsigned Opcode = 0;
31212 // Check for x CC y ? x : y.
31213 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31214 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31218 // Converting this to a min would handle NaNs incorrectly, and swapping
31219 // the operands would cause it to handle comparisons between positive
31220 // and negative zero incorrectly.
31221 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31222 if (!DAG.getTarget().Options.UnsafeFPMath &&
31223 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31225 std::swap(LHS, RHS);
31227 Opcode = X86ISD::FMIN;
31230 // Converting this to a min would handle comparisons between positive
31231 // and negative zero incorrectly.
31232 if (!DAG.getTarget().Options.UnsafeFPMath &&
31233 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31235 Opcode = X86ISD::FMIN;
31238 // Converting this to a min would handle both negative zeros and NaNs
31239 // incorrectly, but we can swap the operands to fix both.
31240 std::swap(LHS, RHS);
31245 Opcode = X86ISD::FMIN;
31249 // Converting this to a max would handle comparisons between positive
31250 // and negative zero incorrectly.
31251 if (!DAG.getTarget().Options.UnsafeFPMath &&
31252 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31254 Opcode = X86ISD::FMAX;
31257 // Converting this to a max would handle NaNs incorrectly, and swapping
31258 // the operands would cause it to handle comparisons between positive
31259 // and negative zero incorrectly.
31260 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31261 if (!DAG.getTarget().Options.UnsafeFPMath &&
31262 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31264 std::swap(LHS, RHS);
31266 Opcode = X86ISD::FMAX;
31269 // Converting this to a max would handle both negative zeros and NaNs
31270 // incorrectly, but we can swap the operands to fix both.
31271 std::swap(LHS, RHS);
31276 Opcode = X86ISD::FMAX;
31279 // Check for x CC y ? y : x -- a min/max with reversed arms.
31280 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
31281 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
31285 // Converting this to a min would handle comparisons between positive
31286 // and negative zero incorrectly, and swapping the operands would
31287 // cause it to handle NaNs incorrectly.
31288 if (!DAG.getTarget().Options.UnsafeFPMath &&
31289 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
31290 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31292 std::swap(LHS, RHS);
31294 Opcode = X86ISD::FMIN;
31297 // Converting this to a min would handle NaNs incorrectly.
31298 if (!DAG.getTarget().Options.UnsafeFPMath &&
31299 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
31301 Opcode = X86ISD::FMIN;
31304 // Converting this to a min would handle both negative zeros and NaNs
31305 // incorrectly, but we can swap the operands to fix both.
31306 std::swap(LHS, RHS);
31311 Opcode = X86ISD::FMIN;
31315 // Converting this to a max would handle NaNs incorrectly.
31316 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31318 Opcode = X86ISD::FMAX;
31321 // Converting this to a max would handle comparisons between positive
31322 // and negative zero incorrectly, and swapping the operands would
31323 // cause it to handle NaNs incorrectly.
31324 if (!DAG.getTarget().Options.UnsafeFPMath &&
31325 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
31326 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31328 std::swap(LHS, RHS);
31330 Opcode = X86ISD::FMAX;
31333 // Converting this to a max would handle both negative zeros and NaNs
31334 // incorrectly, but we can swap the operands to fix both.
31335 std::swap(LHS, RHS);
31340 Opcode = X86ISD::FMAX;
31346 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
31349 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
31350 // lowering on KNL. In this case we convert it to
31351 // v16i8 (select v16i8, v16i8, v16i8) and use the AVX instruction instead.
31352 // The same applies to all 128- and 256-bit vectors of i8 and i16.
31353 // On SKX and later, these selects have a proper lowering.
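// For example (illustrative): on KNL, (v16i8 select v16i1 %m, %a, %b) is
// rewritten as (v16i8 select (v16i8 sign_extend %m), %a, %b), which can then
// match a byte blend such as VPBLENDVB.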
31354 if (Subtarget.hasAVX512() && CondVT.isVector() &&
31355 CondVT.getVectorElementType() == MVT::i1 &&
31356 (VT.is128BitVector() || VT.is256BitVector()) &&
31357 (VT.getVectorElementType() == MVT::i8 ||
31358 VT.getVectorElementType() == MVT::i16) &&
31359 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
31360 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
31361 DCI.AddToWorklist(Cond.getNode());
31362 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
31365 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
31368 // Canonicalize max and min:
31369 // (x > y) ? x : y -> (x >= y) ? x : y
31370 // (x < y) ? x : y -> (x <= y) ? x : y
31371 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
31372 // the need for an extra compare
31373 // against zero. e.g.
31374 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
// Consider manually transforming this:
31376 //   testl %edi, %edi
//   movl $0, %eax
31378 //   cmovgl %edi, %eax
// to:
//   xorl %eax, %eax
31382 //   cmovsl %eax, %edi
31383 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
31384 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31385 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31386 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31391 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
31392 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
31393 Cond.getOperand(0), Cond.getOperand(1), NewCC);
31394 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
31399 // Early exit check
31400 if (!TLI.isTypeLegal(VT))
31403 // Match VSELECTs into subs with unsigned saturation.
31404 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
31405 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
31406 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
31407 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
31408 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31410 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
31411 // left side invert the predicate to simplify logic below.
31413 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
31415 CC = ISD::getSetCCInverse(CC, true);
31416 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
31420 if (Other.getNode() && Other->getNumOperands() == 2 &&
31421 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
31422 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
31423 SDValue CondRHS = Cond->getOperand(1);
31425 // Look for a general sub with unsigned saturation first.
31426 // x >= y ? x-y : 0 --> subus x, y
31427 // x > y ? x-y : 0 --> subus x, y
31428 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
31429 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
31430 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
31432 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
31433 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
31434 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
31435 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
31436 // If the RHS is a constant we have to reverse the const
31437 // canonicalization.
31438 // x > C-1 ? x+(-C) : 0 --> subus x, C
31439 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
31440 CondRHSConst->getAPIntValue() ==
31441 (-OpRHSConst->getAPIntValue() - 1))
31442 return DAG.getNode(
31443 X86ISD::SUBUS, DL, VT, OpLHS,
31444 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
31446 // Another special case: If C was a sign bit, the sub has been
31447 // canonicalized into a xor.
31448 // FIXME: Would it be better to use computeKnownBits to determine
31449 // whether it's safe to decanonicalize the xor?
31450 // x s< 0 ? x^C : 0 --> subus x, C
31451 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
31452 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
31453 OpRHSConst->getAPIntValue().isSignMask())
31454 // Note that we have to rebuild the RHS constant here to ensure we
31455 // don't rely on particular values of undef lanes.
31456 return DAG.getNode(
31457 X86ISD::SUBUS, DL, VT, OpLHS,
31458 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
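// Worked example for the two special cases above (hypothetical constant
// C == 64, i8 elements): (x >u 63 ? x + (-64) : 0) becomes (subus x, 64),
// and (x s< 0 ? x ^ 0x80 : 0) becomes (subus x, 0x80), since x s< 0 means
// x >=u 128 and then x ^ 0x80 == x - 128.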
31463 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
31466 // If this is a *dynamic* select (non-constant condition) and we can match
31467 // this node with one of the variable blend instructions, restructure the
31468 // condition so that blends can use the high (sign) bit of each element and
31469 // use SimplifyDemandedBits to simplify the condition operand.
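// For example (illustrative): PBLENDVB/BLENDVPS-style variable blends test
// only the sign bit of each condition element, so restricting the demanded
// bits to the sign mask below lets SimplifyDemandedBits discard whatever
// computed the remaining bits.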
31470 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
31471 !DCI.isBeforeLegalize() &&
31472 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
31473 unsigned BitWidth = Cond.getScalarValueSizeInBits();
31475 // Don't optimize vector selects that map to mask-registers.
31479 // We can only handle the cases where VSELECT is directly legal on the
31480 // subtarget. We custom lower VSELECT nodes with constant conditions and
31481 // this makes it hard to see whether a dynamic VSELECT will correctly
31482 // lower, so we both check the operation's status and explicitly handle the
31483 // cases where a *dynamic* blend will fail even though a constant-condition
31484 // blend could be custom lowered.
31485 // FIXME: We should find a better way to handle this class of problems.
31486 // Potentially, we should combine constant-condition vselect nodes
31487 // pre-legalization into shuffles and not mark as many types as custom
31489 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
31491 // FIXME: We don't support i16-element blends currently. We could and
31492 // should support them by making *all* the bits in the condition be set
31493 // rather than just the high bit and using an i8-element blend.
31494 if (VT.getVectorElementType() == MVT::i16)
31496 // Dynamic blending was only available from SSE4.1 onward.
31497 if (VT.is128BitVector() && !Subtarget.hasSSE41())
31499 // Byte blends are only available in AVX2.
31500 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
31502 // There are no 512-bit blend instructions that use sign bits.
31503 if (VT.is512BitVector())
31506 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
31507 APInt DemandedMask(APInt::getSignMask(BitWidth));
31509 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
31510 !DCI.isBeforeLegalizeOps());
31511 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
31512 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
31513 // If we changed the computation somewhere in the DAG, this change will
31514 // affect all users of Cond. Make sure it is fine and update all the nodes
31515 // so that we do not use the generic VSELECT anymore. Otherwise, we may
31516 // perform wrong optimizations as we messed with the actual expectation
31517 // for the vector boolean values.
31518 if (Cond != TLO.Old) {
31519 // Check all uses of the condition operand to check whether it will be
31520 // consumed by non-BLEND instructions. Those may require that all bits
31521 // are set properly.
31522 for (SDNode *U : Cond->uses()) {
31523 // TODO: Add other opcodes eventually lowered into BLEND.
31524 if (U->getOpcode() != ISD::VSELECT)
31528 // Update all users of the condition before committing the change, so
31529 // that the VSELECT optimizations that expect the correct vector boolean
31530 // value will not be triggered.
31531 for (SDNode *U : Cond->uses()) {
31532 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
31533 U->getValueType(0), Cond, U->getOperand(1),
31535 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
31537 DCI.CommitTargetLoweringOpt(TLO);
31540 // Only Cond (rather than other nodes in the computation chain) was
31541 // changed. Change the condition just for N to keep the opportunity to
31542 // optimize all other users their own way.
31543 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
31544 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
31549 // Look for vselects with LHS/RHS being bitcasted from an operation that
31550 // can be executed on another type. Push the bitcast to the inputs of
31551 // the operation. This exposes opportunities for using masking instructions.
31552 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
31553 CondVT.getVectorElementType() == MVT::i1) {
31554 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
31555 return SDValue(N, 0);
31556 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
31557 return SDValue(N, 0);
31560 // Custom action for SELECT MMX
31561 if (VT == MVT::x86mmx) {
31562 LHS = DAG.getBitcast(MVT::i64, LHS);
31563 RHS = DAG.getBitcast(MVT::i64, RHS);
31564 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
31565 return DAG.getBitcast(VT, newSelect);
31572 /// Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
31574 /// to: (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
31575 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
31576 /// Note that this is only legal for some op/cc combinations.
31577 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
31579 const X86Subtarget &Subtarget) {
31580 // This combine only operates on CMP-like nodes.
31581 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31582 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31585 // Can't replace the cmp if it has more uses than the one we're looking at.
31586 // FIXME: We would like to be able to handle this, but would need to make sure
31587 // all uses were updated.
31588 if (!Cmp.hasOneUse())
31591 // This only applies to variations of the common case:
31592 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
31593 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
31594 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
31595 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
31596 // Using the proper condcodes (see below), overflow is checked for.
31598 // FIXME: We can generalize both constraints:
31599 // - XOR/OR/AND (if they were made to survive AtomicExpand)
// - LHS != 1
31601 // if the result is compared.
31603 SDValue CmpLHS = Cmp.getOperand(0);
31604 SDValue CmpRHS = Cmp.getOperand(1);
31606 if (!CmpLHS.hasOneUse())
31609 unsigned Opc = CmpLHS.getOpcode();
31610 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
31613 SDValue OpRHS = CmpLHS.getOperand(2);
31614 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
31618 APInt Addend = OpRHSC->getAPIntValue();
31619 if (Opc == ISD::ATOMIC_LOAD_SUB)
31622 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
31626 APInt Comparison = CmpRHSC->getAPIntValue();
31628 // If the addend is the negation of the comparison value, then we can do
31629 // a full comparison by emitting the atomic arithmetic as a locked sub.
31630 if (Comparison == -Addend) {
31631 // The CC is fine, but we need to rewrite the LHS of the comparison as an atomic sub.
31633 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
31634 auto AtomicSub = DAG.getAtomic(
31635 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
31636 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
31637 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
31638 AN->getMemOperand());
31639 // If the comparison uses the CF flag we can't use INC/DEC instructions.
31640 bool NeedCF = false;
31643 case X86::COND_A: case X86::COND_AE:
31644 case X86::COND_B: case X86::COND_BE:
31648 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
31649 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
31650 DAG.getUNDEF(CmpLHS.getValueType()));
31651 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
31655 // We can handle comparisons with zero in a number of cases by manipulating the CC used.
31657 if (!Comparison.isNullValue())
31660 if (CC == X86::COND_S && Addend == 1)
31662 else if (CC == X86::COND_NS && Addend == 1)
31664 else if (CC == X86::COND_G && Addend == -1)
31666 else if (CC == X86::COND_LE && Addend == -1)
31671 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
31672 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
31673 DAG.getUNDEF(CmpLHS.getValueType()));
31674 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
31678 // Check whether a boolean test is testing a boolean value generated by
31679 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition flag.
31682 // Simplify the following patterns:
31683 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
31684 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
31685 // to (Op EFLAGS Cond)
31687 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
31688 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
31689 // to (Op EFLAGS !Cond)
31691 // where Op could be BRCOND or CMOV.
31693 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
31694 // This combine only operates on CMP-like nodes.
31695 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31696 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31699 // Quit if not used as a boolean value.
31700 if (CC != X86::COND_E && CC != X86::COND_NE)
31703 // Check CMP operands. One of them should be 0 or 1 and the other should be
31704 // a SetCC or extended from it.
31705 SDValue Op1 = Cmp.getOperand(0);
31706 SDValue Op2 = Cmp.getOperand(1);
31709 const ConstantSDNode* C = nullptr;
31710 bool needOppositeCond = (CC == X86::COND_E);
31711 bool checkAgainstTrue = false; // Is it a comparison against 1?
31713 if ((C = dyn_cast<ConstantSDNode>(Op1)))
31715 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
31717 else // Quit if neither operand is a constant.
31720 if (C->getZExtValue() == 1) {
31721 needOppositeCond = !needOppositeCond;
31722 checkAgainstTrue = true;
31723 } else if (C->getZExtValue() != 0)
31724 // Quit if the constant is neither 0 nor 1.
31727 bool truncatedToBoolWithAnd = false;
31728 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
31729 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
31730 SetCC.getOpcode() == ISD::TRUNCATE ||
31731 SetCC.getOpcode() == ISD::AND) {
31732 if (SetCC.getOpcode() == ISD::AND) {
31734 if (isOneConstant(SetCC.getOperand(0)))
31736 if (isOneConstant(SetCC.getOperand(1)))
31740 SetCC = SetCC.getOperand(OpIdx);
31741 truncatedToBoolWithAnd = true;
31743 SetCC = SetCC.getOperand(0);
31746 switch (SetCC.getOpcode()) {
31747 case X86ISD::SETCC_CARRY:
31748 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
31749 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
31750 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
31751 // truncated to i1 using 'and'.
31752 if (checkAgainstTrue && !truncatedToBoolWithAnd)
31754 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
31755 "Invalid use of SETCC_CARRY!");
31757 case X86ISD::SETCC:
31758 // Set the condition code or the opposite one if necessary.
31759 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
31760 if (needOppositeCond)
31761 CC = X86::GetOppositeBranchCondition(CC);
31762 return SetCC.getOperand(1);
31763 case X86ISD::CMOV: {
31764 // Check whether the false/true values are canonical, i.e. 0 or 1.
31765 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
31766 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
31767 // Quit if true value is not a constant.
31770 // Quit if false value is not a constant.
31772 SDValue Op = SetCC.getOperand(0);
31773 // Skip 'zext' or 'trunc' node.
31774 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
31775 Op.getOpcode() == ISD::TRUNCATE)
31776 Op = Op.getOperand(0);
31777 // A special case for rdrand/rdseed, where 0 is set if false cond is found.
31779 if ((Op.getOpcode() != X86ISD::RDRAND &&
31780 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
31783 // Quit if false value is not the constant 0 or 1.
31784 bool FValIsFalse = true;
31785 if (FVal && FVal->getZExtValue() != 0) {
31786 if (FVal->getZExtValue() != 1)
31788 // If FVal is 1, opposite cond is needed.
31789 needOppositeCond = !needOppositeCond;
31790 FValIsFalse = false;
31792 // Quit if TVal is not the constant opposite of FVal.
31793 if (FValIsFalse && TVal->getZExtValue() != 1)
31795 if (!FValIsFalse && TVal->getZExtValue() != 0)
31797 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
31798 if (needOppositeCond)
31799 CC = X86::GetOppositeBranchCondition(CC);
31800 return SetCC.getOperand(3);
31807 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
31809 ///   (X86or (X86setcc) (X86setcc))
31810 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
31811 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
31812 X86::CondCode &CC1, SDValue &Flags,
31814 if (Cond->getOpcode() == X86ISD::CMP) {
31815 if (!isNullConstant(Cond->getOperand(1)))
31818 Cond = Cond->getOperand(0);
31823 SDValue SetCC0, SetCC1;
31824 switch (Cond->getOpcode()) {
31825 default: return false;
31832 SetCC0 = Cond->getOperand(0);
31833 SetCC1 = Cond->getOperand(1);
31837 // Make sure we have SETCC nodes, using the same flags value.
31838 if (SetCC0.getOpcode() != X86ISD::SETCC ||
31839 SetCC1.getOpcode() != X86ISD::SETCC ||
31840 SetCC0->getOperand(1) != SetCC1->getOperand(1))
31843 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
31844 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
31845 Flags = SetCC0->getOperand(1);
31849 // When legalizing carry, we create carries via add X, -1.
31850 // If that comes from an actual carry, via setcc, we use the carry directly.
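// For illustration (hypothetical node): with X = zext (setcc COND_B, F) in
// {0, 1}, (add X, -1) produces a carry-out exactly when X == 1, i.e. the
// resulting CF equals the CF produced by F, so a COND_B user can consume F
// directly.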
31852 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
31853 if (EFLAGS.getOpcode() == X86ISD::ADD) {
31854 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
31855 SDValue Carry = EFLAGS.getOperand(0);
31856 while (Carry.getOpcode() == ISD::TRUNCATE ||
31857 Carry.getOpcode() == ISD::ZERO_EXTEND ||
31858 Carry.getOpcode() == ISD::SIGN_EXTEND ||
31859 Carry.getOpcode() == ISD::ANY_EXTEND ||
31860 (Carry.getOpcode() == ISD::AND &&
31861 isOneConstant(Carry.getOperand(1))))
31862 Carry = Carry.getOperand(0);
31863 if (Carry.getOpcode() == X86ISD::SETCC ||
31864 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
31865 if (Carry.getConstantOperandVal(0) == X86::COND_B)
31866 return Carry.getOperand(1);
31874 /// Optimize an EFLAGS definition used according to the condition code \p CC
31875 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
31876 /// uses of chain values.
31877 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
31879 const X86Subtarget &Subtarget) {
31880 if (CC == X86::COND_B)
31881 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
31884 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
31886 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
31889 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
31890 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
31891 TargetLowering::DAGCombinerInfo &DCI,
31892 const X86Subtarget &Subtarget) {
31895 SDValue FalseOp = N->getOperand(0);
31896 SDValue TrueOp = N->getOperand(1);
31897 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
31898 SDValue Cond = N->getOperand(3);
31900 if (CC == X86::COND_E || CC == X86::COND_NE) {
31901 switch (Cond.getOpcode()) {
31905 // If the operand of BSR/BSF is proven never zero, then ZF cannot be set.
31906 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
31907 return (CC == X86::COND_E) ? FalseOp : TrueOp;
31911 // Try to simplify the EFLAGS and condition code operands.
31912 // We can't always do this as FCMOV only supports a subset of the X86 condition codes.
31913 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
31914 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
31915 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
31917 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
31921 // If this is a select between two integer constants, try to do some
31922 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
31924 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
31925 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
31926 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
31927 // larger than FalseC (the false value).
31928 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
31929 CC = X86::GetOppositeBranchCondition(CC);
31930 std::swap(TrueC, FalseC);
31931 std::swap(TrueOp, FalseOp);
31934 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
31935 // This is efficient for any integer data type (including i8/i16) and shift amount.
31937 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
31938 Cond = getSETCC(CC, Cond, DL, DAG);
31940 // Zero extend the condition if needed.
31941 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
31943 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
31944 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
31945 DAG.getConstant(ShAmt, DL, MVT::i8));
31949 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
31950 // for any integer data type, including i8/i16.
31951 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
31952 Cond = getSETCC(CC, Cond, DL, DAG);
31954 // Zero extend the condition if needed.
31955 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
31956 FalseC->getValueType(0), Cond);
31957 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31958 SDValue(FalseC, 0));
31962 // Optimize cases that will turn into an LEA instruction. This requires
31963 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31964 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
31965 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31966 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
31968 bool isFastMultiplier = false;
31970 switch ((unsigned char)Diff) {
31972 case 1: // result = add base, cond
31973 case 2: // result = lea base( , cond*2)
31974 case 3: // result = lea base(cond, cond*2)
31975 case 4: // result = lea base( , cond*4)
31976 case 5: // result = lea base(cond, cond*4)
31977 case 8: // result = lea base( , cond*8)
31978 case 9: // result = lea base(cond, cond*8)
31979 isFastMultiplier = true;
31984 if (isFastMultiplier) {
31985 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
31986 Cond = getSETCC(CC, Cond, DL ,DAG);
31987 // Zero extend the condition if needed.
31988 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
31990 // Scale the condition by the difference.
31992 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
31993 DAG.getConstant(Diff, DL, Cond.getValueType()));
31995 // Add the base if non-zero.
31996 if (FalseC->getAPIntValue() != 0)
31997 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31998 SDValue(FalseC, 0));
32005 // Handle these cases:
32006 // (select (x != c), e, c) -> (select (x != c), e, x),
32007 // (select (x == c), c, e) -> (select (x == c), x, e)
32008 // where c is an integer constant, and the "select" is the combination
32009 // of CMOV and CMP.
32011 // The rationale for this change is that the conditional-move from a constant
32012 // needs two instructions; however, conditional-move from a register needs
32013 // only one instruction.
32015 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
32016 // some instruction-combining opportunities. This opt needs to be
32017 // postponed as late as possible.
32019 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
32020 // the DCI.xxxx conditions are provided to postpone the optimization as
32021 // late as possible.
32023 ConstantSDNode *CmpAgainst = nullptr;
32024 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
32025 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
32026 !isa<ConstantSDNode>(Cond.getOperand(0))) {
32028 if (CC == X86::COND_NE &&
32029 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
32030 CC = X86::GetOppositeBranchCondition(CC);
32031 std::swap(TrueOp, FalseOp);
32034 if (CC == X86::COND_E &&
32035 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
32036 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
32037 DAG.getConstant(CC, DL, MVT::i8), Cond };
32038 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32043 // Fold and/or of setcc's to double CMOV:
32044 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
32045 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
32047 // This combine lets us generate:
32048 //   cmovcc1 (jcc1 if we don't have CMOV)
//   cmovcc2 (same)
// instead of:
//   setcc1
//   setcc2
//   andl or orl
32054 //   cmovne (jne if we don't have CMOV)
32055 // When we can't use the CMOV instruction, it might increase branch mispredicts.
32057 // When we can use CMOV, or when there is no mispredict, this improves
32058 // throughput and reduces register pressure.
32060 if (CC == X86::COND_NE) {
32062 X86::CondCode CC0, CC1;
32064 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
32066 std::swap(FalseOp, TrueOp);
32067 CC0 = X86::GetOppositeBranchCondition(CC0);
32068 CC1 = X86::GetOppositeBranchCondition(CC1);
32071 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
32073 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
32074 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
32075 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32083 /// Different mul shrinking modes.
32084 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
32086 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
32087 EVT VT = N->getOperand(0).getValueType();
32088 if (VT.getScalarSizeInBits() != 32)
32091 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
32092 unsigned SignBits[2] = {1, 1};
32093 bool IsPositive[2] = {false, false};
32094 for (unsigned i = 0; i < 2; i++) {
32095 SDValue Opd = N->getOperand(i);
32097 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
32098 // compute signbits for it separately.
32099 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
32100 // For anyextend, it is safe to assume an appropriate number of leading zero/sign bits.
32102 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
32104 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
32109 IsPositive[i] = true;
32110 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
32111 // All the operands of BUILD_VECTOR need to be integer constants.
32112 // Find the smallest value range which all the operands belong to.
32114 IsPositive[i] = true;
32115 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
32116 if (SubOp.isUndef())
32118 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
32121 APInt IntVal = CN->getAPIntValue();
32122 if (IntVal.isNegative())
32123 IsPositive[i] = false;
32124 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
32127 SignBits[i] = DAG.ComputeNumSignBits(Opd);
32128 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
32129 IsPositive[i] = true;
32133 bool AllPositive = IsPositive[0] && IsPositive[1];
32134 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
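// Reasoning sketch for the thresholds below: a 32-bit value with at least
// 25 known sign bits has at most 8 significant bits, so it fits in
// [-128, 127]; 24 sign bits plus known-nonnegative fits in [0, 255]; 17 and
// 16 sign bits likewise bound the signed and unsigned i16 ranges.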
32135 // When ranges are from -128 ~ 127, use MULS8 mode.
32136 if (MinSignBits >= 25)
32138 // When ranges are from 0 ~ 255, use MULU8 mode.
32139 else if (AllPositive && MinSignBits >= 24)
32141 // When ranges are from -32768 ~ 32767, use MULS16 mode.
32142 else if (MinSignBits >= 17)
32144 // When ranges are from 0 ~ 65535, use MULU16 mode.
32145 else if (AllPositive && MinSignBits >= 16)
32152 /// When the operands of vector mul are extended from smaller size values,
32153 /// like i8 and i16, the type of mul may be shrunk to generate more
32154 /// efficient code. Two typical patterns are handled:
32156 /// %2 = sext/zext <N x i8> %1 to <N x i32>
32157 /// %4 = sext/zext <N x i8> %3 to <N x i32>
32158 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32159 /// %5 = mul <N x i32> %2, %4
32162 /// %2 = zext/sext <N x i16> %1 to <N x i32>
32163 /// %4 = zext/sext <N x i16> %3 to <N x i32>
32164 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32165 /// %5 = mul <N x i32> %2, %4
32167 /// There are four mul shrinking modes:
32168 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
32169 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
32170 /// generate pmullw+sext32 for it (MULS8 mode).
32171 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
32172 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
32173 /// generate pmullw+zext32 for it (MULU8 mode).
32174 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
32175 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
32176 /// generate pmullw+pmulhw for it (MULS16 mode).
32177 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
32178 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
32179 /// generate pmullw+pmulhuw for it (MULU16 mode).
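///
/// Illustrative example for the MULU16 path below: given <8 x i32> operands
/// whose lanes fit in u16, the combine truncates both to <8 x i16>, forms
/// the low products with pmullw and the high products with pmulhuw, and
/// interleaves them back into eight i32 lanes via punpcklwd/punpckhwd.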
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // Check for legality.
  // pmullw/pmulhw are not supported by SSE.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Check for profitability.
  // pmulld is supported since SSE41. It is better to use pmulld
  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
  // the expansion to pmullw+pmulhw.
  bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(N, DAG, Mode))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getOperand(0).getValueType();
  unsigned NumElts = VT.getVectorNumElements();
  if ((NumElts % 2) != 0)
    return SDValue();

  unsigned RegSize = 128;
  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

  // Shrink the operands of the mul.
  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

  if (NumElts >= OpsVT.getVectorNumElements()) {
    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
    // lower part is needed.
    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
    if (Mode == MULU8 || Mode == MULS8) {
      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
                         DL, VT, MulLo);
    } else {
      MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
      // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
      // the higher part is also needed.
      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
                                  ReducedVT, NewN0, NewN1);

      // Repack the lower part and higher part result of the mul into a wider
      // result.
      // Generate shuffle functioning as punpcklwd.
      SmallVector<int, 16> ShuffleMask(NumElts);
      for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
        ShuffleMask[2 * i] = i;
        ShuffleMask[2 * i + 1] = i + NumElts;
      }
      SDValue ResLo =
          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
      ResLo = DAG.getBitcast(ResVT, ResLo);
      // Generate shuffle functioning as punpckhwd.
      for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
        ShuffleMask[2 * i] = i + NumElts / 2;
        ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
      }
      SDValue ResHi =
          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
      ResHi = DAG.getBitcast(ResVT, ResHi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
    }
  } else {
    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
    // to legalize the mul explicitly because implicit legalization for type
    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
    // instructions which will not exist when we explicitly legalize it by
    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
    // <4 x i16> undef).
    //
    // Legalize the operands of the mul.
    // FIXME: We may be able to handle non-concatenated vectors by insertion.
    unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
    if ((RegSize % ReducedSizeInBits) != 0)
      return SDValue();

    SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
                                 DAG.getUNDEF(ReducedVT));
    Ops[0] = NewN0;
    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
    Ops[0] = NewN1;
    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);

    if (Mode == MULU8 || Mode == MULS8) {
      // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
      // lower part is needed.
      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

      // Convert the type of the mul result to VT.
      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
                                DL, ResVT, Mul);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                         DAG.getIntPtrConstant(0, DL));
    } else {
      // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
      // MULU16/MULS16, both parts are needed.
      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
                                  OpsVT, NewN0, NewN1);

      // Repack the lower part and higher part result of the mul into a wider
      // result. Make sure the type of the mul result is VT.
      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
      SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
      Res = DAG.getBitcast(ResVT, Res);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                         DAG.getIntPtrConstant(0, DL));
    }
  }
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
                                 EVT VT, SDLoc DL) {

  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(Mult, DL, VT));
    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
                         DAG.getConstant(Shift, DL, MVT::i8));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

  auto combineMulMulAddOrSub = [&](bool isAdd) {
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(9, DL, VT));
    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

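  // The cases below decompose the constant into LEA-friendly factors: 3, 5
  // and 9 each encode as one LEA (x + x*2/4/8), and the remainder is fixed
  // up with a shift and an add/sub. As an arithmetic check for case 29:
  // (x*9)*3 + x + x == 27x + 2x == 29x.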
  switch (MulAmt) {
  default:
    break;
  case 11:
    // mul x, 11 => add ((shl (mul x, 5), 1), x)
    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
  case 21:
    // mul x, 21 => add ((shl (mul x, 5), 2), x)
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
  case 22:
    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
  case 19:
    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
  case 13:
    // mul x, 13 => add ((shl (mul x, 3), 2), x)
    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
  case 23:
    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
  case 14:
    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
  case 26:
    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
    return combineMulMulAddOrSub(/*isAdd*/ false);
  case 28:
    // mul x, 28 => add ((mul (mul x, 9), 3), x)
    return combineMulMulAddOrSub(/*isAdd*/ true);
  case 29:
    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulMulAddOrSub(/*isAdd*/ true));
  case 30:
    // mul x, 30 => sub (sub ((shl x, 5), x), x)
    return DAG.getNode(
        ISD::SUB, DL, VT,
        DAG.getNode(ISD::SUB, DL, VT,
                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                DAG.getConstant(5, DL, MVT::i8)),
                    N->getOperand(0)),
        N->getOperand(0));
  }
  return SDValue();
}
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (DCI.isBeforeLegalize() && VT.isVector())
    return reduceVMULWidth(N, DAG, Subtarget);

  if (!MulConstantOptimization)
    return SDValue();

  // An imul is usually smaller than the alternative sequence.
  if (DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT != MVT::i64 && VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }

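  // E.g. (illustrative): MulAmt == 45 factors as 9 * 5, so the path below
  // emits two X86ISD::MUL_IMM nodes that instruction selection can turn
  // into two LEAs: t = x + 8*x, then t + 4*t.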
  SDLoc DL(N);
  SDValue NewMul;
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first. We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an add.
      std::swap(MulAmt1, MulAmt2);

    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, DL, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, DL, VT));
  } else if (!Subtarget.slowLEA())
    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);

  if (!NewMul) {
    assert(MulAmt != 0 &&
           MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
           "Both cases that could cause potential overflows should have "
           "already been handled.");
    int64_t SignMulAmt = C->getSExtValue();
    if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
        (SignMulAmt != -INT64_MAX)) {
      int NumSign = SignMulAmt > 0 ? 1 : -1;
      bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
      bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
      if (IsPowerOf2_64PlusOne) {
        // (mul x, 2^N + 1) => (add (shl x, N), x)
        NewMul = DAG.getNode(
            ISD::ADD, DL, VT, N->getOperand(0),
            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                        DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
                                        MVT::i8)));
      } else if (IsPowerOf2_64MinusOne) {
        // (mul x, 2^N - 1) => (sub (shl x, N), x)
        NewMul = DAG.getNode(
            ISD::SUB, DL, VT,
            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                        DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
                                        MVT::i8)),
            N->getOperand(0));
      }
      // To negate, subtract the number from zero.
      if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
        NewMul =
            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
    }
  }

  if (NewMul)
    // Do not add new nodes to the DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);

  return SDValue();
}
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
    Mask <<= N1C->getAPIntValue();
    bool MaskOK = false;
    // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
    // of the underlying setcc_c operation if the setcc_c was zero extended.
    // Consider the following example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
                N00.getOpcode() == ISD::ANY_EXTEND) &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
    if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
    }
  }

  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // shl.
  // (shl V, 1) -> add V,V
  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
      assert(N0.getValueType().isVector() && "Invalid vector shift type");
      // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an
      // ADD of two values.
      if (N1SplatC->getAPIntValue() == 1)
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
    }

  return SDValue();
}
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();

  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
  // depending on sign of (SarConst - [56,48,32,24,16])
  //
  // sexts in X86 are MOVs. The MOVs have the same code size
  // as above SHIFTs (only SHIFT on 1 has lower code size).
  // However the MOVs have two advantages over a SHIFT:
  // 1. MOVs can write to a register that differs from source.
  // 2. MOVs accept memory operands.
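  //
  // E.g. (illustrative, i64): (ashr (shl x, 56), 61) becomes
  // (sra (sext_inreg x, i8), 5), since 64 - 56 = 8 selects the i8
  // extension and 61 - 56 = 5 is the remaining arithmetic shift.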
  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
  EVT CVT = N1.getValueType();

  if (SarConst.isNegative())
    return SDValue();

  for (MVT SVT : MVT::integer_valuetypes()) {
    unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and
    // any ShlConst that is not one of [56,48,32,24,16].
    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
                         DAG.getConstant(SarConst, DL, CVT));
  }
  return SDValue();
}
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
  // TODO: This is a generic DAG combine that became an x86-only combine to
  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
  // and-not ('andn').
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();

  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!ShiftC || !AndC)
    return SDValue();

  // If we can shrink the constant mask below 8-bits or 32-bits, then this
  // transform should reduce code size. It may also enable secondary transforms
  // from improved known-bits analysis or instruction selection.
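  // E.g. (illustrative): srl (and X, 0xFE), 1 --> and (srl X, 1), 0x7F;
  // the shifted mask fits a sign-extended 8-bit immediate while the
  // original mask does not.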
  APInt MaskVal = AndC->getAPIntValue();
  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
  unsigned OldMaskSize = MaskVal.getMinSignedBits();
  unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
  if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
      (OldMaskSize > 32 && NewMaskSize <= 32)) {
    // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
    SDLoc DL(N);
    SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
    SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
  }
  return SDValue();
}
/// \brief Returns a vector of 0s if the node in input is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
      (!Subtarget.hasInt256() ||
       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();

  SDValue Amt = N->getOperand(1);
  SDLoc DL(N);
  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
      const APInt &ShiftAmt = AmtSplat->getAPIntValue();
      unsigned MaxAmount = VT.getSimpleVT().getScalarSizeInBits();

      // SSE2/AVX2 logical shifts always return a vector of 0s
      // if the shift amount is bigger than or equal to
      // the element size. The constant shift amount will be
      // encoded as an 8-bit immediate.
      if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
    }

  return SDValue();
}
static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  if (N->getOpcode() == ISD::SHL)
    if (SDValue V = combineShiftLeft(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRA)
    if (SDValue V = combineShiftRightArithmetic(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRL)
    if (SDValue V = combineShiftRightLogical(N, DAG))
      return V;

  // Try to fold this logical shift into a zero vector.
  if (N->getOpcode() != ISD::SRA)
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
      return V;

  return SDValue();
}
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected shift opcode");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
         "Unexpected PACKSS/PACKUS input type");

  // Constant Folding.
  APInt UndefElts0, UndefElts1;
  SmallVector<APInt, 32> EltBits0, EltBits1;
  if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
      (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
    unsigned NumLanes = VT.getSizeInBits() / 128;
    unsigned NumDstElts = VT.getVectorNumElements();
    unsigned NumSrcElts = NumDstElts / 2;
    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
    bool IsSigned = (X86ISD::PACKSS == Opcode);

    APInt Undefs(NumDstElts, 0);
    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);

        if (UndefElts[SrcIdx]) {
          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
          continue;
        }

        APInt &Val = EltBits[SrcIdx];
        if (IsSigned) {
          // PACKSS: Truncate signed value with signed saturation.
          // Source values less than dst minint are saturated to minint.
          // Source values greater than dst maxint are saturated to maxint.
          if (Val.isSignedIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getSignedMinValue(DstBitsPerElt);
          else
            Val = APInt::getSignedMaxValue(DstBitsPerElt);
        } else {
          // PACKUS: Truncate signed value with unsigned saturation.
          // Source values less than zero are saturated to zero.
          // Source values greater than dst maxuint are saturated to maxuint.
          if (Val.isIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getNullValue(DstBitsPerElt);
          else
            Val = APInt::getAllOnesValue(DstBitsPerElt);
        }
        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
      }
    }

    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  // Attempt to combine as shuffle.
  SDValue Op(N, 0);
  if (SDValue Res = combineX86ShufflesRecursively(
          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
    DCI.CombineTo(N, Res);
    return SDValue();
  }

  return SDValue();
}
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
          X86ISD::VSRLI == Opcode) &&
         "Unexpected shift opcode");
  bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
         "Unexpected value type");

  // Out of range logical bit shifts are guaranteed to be zero.
  // Out of range arithmetic bit shifts splat the sign bit.
  APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
    if (LogicalShift)
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
    else
      ShiftVal = NumBitsPerElt - 1;
  }

  // Shift N0 by zero -> N0.
  if (!ShiftVal)
    return N0;

  // Shift zero -> zero.
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

  // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
  // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
  // TODO - support other sra opcodes as needed.
  if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
      N0.getOpcode() == X86ISD::VSRAI)
    return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);

  // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
  if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
      N1 == N0.getOperand(1)) {
    SDValue N00 = N0.getOperand(0);
    unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
    if (ShiftVal.ult(NumSignBits))
      return N00;
  }

  // We can decode 'whole byte' logical bit shifts as shuffles.
  if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  // Constant Folding.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (N->isOnlyUserOf(N0.getNode()) &&
      getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
    assert(EltBits.size() == VT.getVectorNumElements() &&
           "Unexpected shift value type");
    unsigned ShiftImm = ShiftVal.getZExtValue();
    for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
        Elt <<= ShiftImm;
      else if (X86ISD::VSRAI == Opcode)
        Elt.ashrInPlace(ShiftImm);
      else
        Elt.lshrInPlace(ShiftImm);
    }
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  return SDValue();
}
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  assert(
      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
       (N->getOpcode() == X86ISD::PINSRW &&
        N->getValueType(0) == MVT::v8i16)) &&
      "Unexpected vector insertion");

  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  SDValue Op(N, 0);
  if (SDValue Res = combineX86ShufflesRecursively(
          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
    DCI.CombineTo(N, Res);
    return SDValue();
  }

  return SDValue();
}
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT VT = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          if (Subtarget.hasAVX512()) {
            SDValue FSetCC =
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                            DAG.getConstant(x86cc, DL, MVT::i8));
            return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                               N->getSimpleValueType(0), FSetCC,
                               DAG.getIntPtrConstant(0, DL));
          }
          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
                                              CMP00.getValueType(), CMP00, CMP01,
                                              DAG.getConstant(x86cc, DL,
                                                              MVT::i8));

          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

          if (is64BitFP && !Subtarget.is64Bit()) {
            // On a 32-bit target, we cannot bitcast the 64-bit float to a
            // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
            // bits, but can do this little dance to extract the lowest 32 bits
            // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                           OnesOrZeroesF);
            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                        Vector32, DAG.getIntPtrConstant(0, DL));
            IntVT = MVT::i32;
          }

          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                      DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::AND);

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
    return SDValue();

  if (N0.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  if (N1.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector())
    return SDValue();

  assert((N->getOpcode() == ISD::ANY_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND ||
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N->getOperand(0);
  EVT NarrowVT = Narrow->getValueType(0);
  if (!NarrowVT.is128BitVector())
    return SDValue();

  if (Narrow->getOpcode() != ISD::XOR &&
      Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)
    return SDValue();

  SDValue N0 = Narrow->getOperand(0);
  SDValue N1 = Narrow->getOperand(1);
  SDLoc DL(Narrow);

  // The left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  // The type of the truncated inputs.
  EVT WideVT = N0->getOperand(0)->getValueType(0);
  if (WideVT != VT)
    return SDValue();

  // The right side has to be a 'trunc' or a constant vector.
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
  ConstantSDNode *RHSConstSplat = nullptr;
  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
    RHSConstSplat = RHSBV->getConstantSplatNode();
  if (!RHSTrunc && !RHSConstSplat)
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
    return SDValue();

  // Set N0 and N1 to hold the inputs to the new wide operation.
  N0 = N0->getOperand(0);
  if (RHSConstSplat) {
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
                     SDValue(RHSConstSplat, 0));
    N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
  } else if (RHSTrunc) {
    N1 = N1->getOperand(0);
  }

  // Generate the wide operation.
  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND: {
    unsigned InBits = NarrowVT.getScalarSizeInBits();
    APInt Mask = APInt::getAllOnesValue(InBits);
    Mask = Mask.zext(VT.getScalarSizeInBits());
    return DAG.getNode(ISD::AND, DL, VT,
                       Op, DAG.getConstant(Mask, DL, VT));
  }
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
  default:
    llvm_unreachable("Unexpected opcode");
  }
}
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  unsigned FPOpcode = ISD::DELETED_NODE;
  if (N->getOpcode() == ISD::AND)
    FPOpcode = X86ISD::FAND;
  else if (N->getOpcode() == ISD::OR)
    FPOpcode = X86ISD::FOR;
  else if (N->getOpcode() == ISD::XOR)
    FPOpcode = X86ISD::FXOR;

  assert(FPOpcode != ISD::DELETED_NODE &&
         "Unexpected input node for FP logic conversion");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
      ((Subtarget.hasSSE1() && VT == MVT::i32) ||
       (Subtarget.hasSSE2() && VT == MVT::i64))) {
    SDValue N00 = N0.getOperand(0);
    SDValue N10 = N1.getOperand(0);
    EVT N00Type = N00.getValueType();
    EVT N10Type = N10.getValueType();
    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
      return DAG.getBitcast(VT, FPLogic);
    }
  }
  return SDValue();
}
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
  EVT VT0 = Op0.getValueType();
  EVT VT1 = Op1.getValueType();

  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
    return SDValue();

  APInt SplatVal;
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
      !SplatVal.isMask())
    return SDValue();

  if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
    return SDValue();

  unsigned EltBitWidth = VT0.getScalarSizeInBits();
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
    return SDValue();

  SDLoc DL(N);
  unsigned ShiftVal = SplatVal.countTrailingOnes();
  SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
  return DAG.getBitcast(N->getValueType(0), Shift);
}
// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
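// E.g. (illustrative) for a load of arr[idx] from an i32 array, the lowered
// address has the shape (add (shl idx, 2), %arr_base); the operand of the
// shl is the index value returned here.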
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
  if (Ld->isIndexed())
    return SDValue();

  SDValue Base = Ld->getBasePtr();

  if (Base.getOpcode() != ISD::ADD)
    return SDValue();

  SDValue ShiftedIndex = Base.getOperand(0);

  if (ShiftedIndex.getOpcode() != ISD::SHL)
    return SDValue();

  return ShiftedIndex.getOperand(0);
}
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
    switch (VT.getSizeInBits()) {
    default: return false;
    case 64: return Subtarget.is64Bit();
    case 32: return true;
    }
  }
  return false;
}
// This function recognizes cases where the X86 bzhi instruction can replace
// an 'and-load' sequence: an integer value is loaded from an array of
// constants defined as
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// and then bitwise-anded with another input. That is equivalent to
// performing bzhi (zero high bits) on the input, using the same index as
// the load.
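// Arithmetic sketch (illustrative, 32-bit): array[i] == (1 << i) - 1, so
// (x & array[i]) == (x & ((1 << i) - 1)) == bzhi(x, i). The combine below
// materializes the mask as (srl -1, (sub 32, i)) so no load remains.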
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  // Check if the subtarget has a BZHI instruction for the node's type.
  if (!hasBZHI(Subtarget, VT))
    return SDValue();

  // Try matching the pattern for both operands.
  for (unsigned i = 0; i < 2; i++) {
    SDValue N = Node->getOperand(i);
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

    // Bail out if the operand is not a load instruction.
    if (!Ld)
      return SDValue();

    const Value *MemOp = Ld->getMemOperand()->getValue();

    if (!MemOp)
      return SDValue();

    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {

          Constant *Init = GV->getInitializer();
          Type *Ty = Init->getType();
          if (!isa<ConstantDataArray>(Init) ||
              !Ty->getArrayElementType()->isIntegerTy() ||
              Ty->getArrayElementType()->getScalarSizeInBits() !=
                  VT.getSizeInBits() ||
              Ty->getArrayNumElements() >
                  Ty->getArrayElementType()->getScalarSizeInBits())
            continue;

          // Check if the array's constant elements are suitable to our case.
          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
          bool ConstantsMatch = true;
          for (uint64_t j = 0; j < ArrayElementCount; j++) {
            ConstantInt *Elem =
                dyn_cast<ConstantInt>(Init->getAggregateElement(j));
            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
              ConstantsMatch = false;
              break;
            }
          }
          if (!ConstantsMatch)
            continue;

          // Do the transformation (for 32-bit type):
          // -> (and (load arr[idx]), inp)
          // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
          //    that will be replaced with one bzhi instruction.
          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);

          // Get the Node which indexes into the array.
          SDValue Index = getIndexFromUnindexedLoad(Ld);
          if (!Index)
            return SDValue();
          Index = DAG.getZExtOrTrunc(Index, dl, VT);

          SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);

          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
        }
      }
    }
  }
  return SDValue();
}
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // If this is SSE1 only, convert to FAND to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return R;

  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
    return ShiftRight;

  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
    return R;

  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  // Attempt to combine a scalar bitmask AND with an extracted shuffle.
  if ((VT.getScalarSizeInBits() % 8) == 0 &&
      N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
    SDValue BitMask = N->getOperand(1);
    SDValue SrcVec = N->getOperand(0).getOperand(0);
    EVT SrcVecVT = SrcVec.getValueType();

    // Check that the constant bitmask masks whole bytes.
    APInt UndefElts;
    SmallVector<APInt, 64> EltBits;
    if (VT == SrcVecVT.getScalarType() &&
        N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
        getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
        llvm::all_of(EltBits, [](APInt M) {
          return M.isNullValue() || M.isAllOnesValue();
        })) {
      unsigned NumElts = SrcVecVT.getVectorNumElements();
      unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
      unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

      // Create a root shuffle mask from the byte mask and the extracted index.
      SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
      for (unsigned i = 0; i != Scale; ++i) {
        if (UndefElts[i])
          continue;
        int VecIdx = Scale * Idx + i;
        ShuffleMask[VecIdx] =
            EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
      }

      if (SDValue Shuffle = combineX86ShufflesRecursively(
              {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
              /*HasVarMask*/ false, DAG, DCI, Subtarget))
        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
                           N->getOperand(0).getOperand(1));
    }
  }

  return SDValue();
}
// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
// As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
        (VT.is256BitVector() && Subtarget.hasInt256())))
    return SDValue();

  // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
  // ANDNP combine allows other combines to happen that prevent matching.
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
    return SDValue();

  SDValue Mask = N1.getOperand(0);
  SDValue X = N1.getOperand(1);
  SDValue Y;
  if (N0.getOperand(0) == Mask)
    Y = N0.getOperand(1);
  if (N0.getOperand(1) == Mask)
    Y = N0.getOperand(0);

  // Check to see if the mask appeared in both the AND and ANDNP.
  if (!Y.getNode())
    return SDValue();

  // Validate that X, Y, and Mask are bitcasts, and see through them.
  Mask = peekThroughBitcasts(Mask);
  X = peekThroughBitcasts(X);
  Y = peekThroughBitcasts(Y);

  EVT MaskVT = Mask.getValueType();
  unsigned EltBits = MaskVT.getScalarSizeInBits();

  // TODO: Attempt to handle floating point cases as well?
  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
    return SDValue();

  SDLoc DL(N);

  // Try to match:
  //   (or (and (M, (sub 0, X)), (pandn M, X)))
  // which is a special case of vselect:
  //   (vselect M, (sub 0, X), X)
  // Per:
  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
  // We know that, if fNegate is 0 or 1:
  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
  //
  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
  //   (  M     ? -X : X) == ((X ^  M      ) + (M & 1))
  // This lets us transform our vselect to:
  //   (add (xor X, M), (and M, 1))
  // And further to:
  //   (sub (xor X, M), M)
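  //
  // Per-lane sanity check (illustrative): if M == -1 then
  // (X ^ -1) - (-1) == ~X + 1 == -X, and if M == 0 then
  // (X ^ 0) - 0 == X, matching the select semantics.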
  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
      DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
    auto IsNegV = [](SDNode *N, SDValue V) {
      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };
    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = X;
    else if (IsNegV(X.getNode(), Y))
      V = Y;

    if (V) {
      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
      SDValue SubOp2 = Mask;

      // If the negate was on the false side of the select, then
      // the operands of the SUB need to be swapped. PR 27251.
      // This is because the pattern being matched above is
      // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
      // but if the pattern matched was
      // (vselect M, X, (sub (0, X))), that is really negation of the pattern
      // above, -(vselect M, (sub 0, X), X), and therefore the replacement
      // pattern also needs to be a negation of the replacement pattern above.
      // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
      // sub accomplishes the negation of the replacement pattern.
      if (V == Y)
        std::swap(SubOp1, SubOp2);

      SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
      return DAG.getBitcast(VT, Res);
    }
  }

  // PBLENDVB is only available on SSE 4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();

  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

  X = DAG.getBitcast(BlendVT, X);
  Y = DAG.getBitcast(BlendVT, Y);
  Mask = DAG.getBitcast(BlendVT, Mask);
  Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
  return DAG.getBitcast(VT, Mask);
}
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
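// E.g. (illustrative, i32): ctlz(x) == 32 exactly when x == 0, and 32 is
// the only possible ctlz result with bit 5 set, so (srl (ctlz x), 5)
// computes the i1 value of x == 0.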
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
                                          SelectionDAG &DAG) {
  SDValue Cmp = Op.getOperand(1);
  EVT VT = Cmp.getOperand(0).getValueType();
  unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
  // The result of the shift is true or false, and on X86, the 32-bit
  // encoding of shr and lzcnt is more desirable.
  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
                            DAG.getConstant(Log2b, dl, VT));
  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
// Try to transform:
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
// Will also attempt to match more generic cases, e.g.:
//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
    return SDValue();

  auto isORCandidate = [](SDValue N) {
    return (N->getOpcode() == ISD::OR && N->hasOneUse());
  };

  // Check the zero extend is extending to 32-bit or more. The code generated by
  // srl(ctlz) for 16-bit or less variants of the pattern would require extra
  // instructions to clear the upper bits.
  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
      !isORCandidate(N->getOperand(0)))
    return SDValue();

  // Check the node matches: setcc(eq, cmp 0)
  auto isSetCCCandidate = [](SDValue N) {
    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
           N->getOperand(1).getOpcode() == X86ISD::CMP &&
           isNullConstant(N->getOperand(1).getOperand(1)) &&
           N->getOperand(1).getValueType().bitsGE(MVT::i32);
  };

  SDNode *OR = N->getOperand(0).getNode();
  SDValue LHS = OR->getOperand(0);
  SDValue RHS = OR->getOperand(1);

  // Save nodes matching or(or, setcc(eq, cmp 0)).
  SmallVector<SDNode *, 2> ORNodes;
  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
    ORNodes.push_back(OR);
    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
  }

  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
      !isORCandidate(SDValue(OR, 0)))
    return SDValue();

  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
  // to
  // or(srl(ctlz),srl(ctlz)).
  // The dag combiner can then fold it into:
  // srl(or(ctlz, ctlz)).
  EVT VT = OR->getValueType(0);
  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
  SDValue Ret, NewRHS;
  if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

  if (!Ret)
    return SDValue();

  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
  while (ORNodes.size() > 0) {
    OR = ORNodes.pop_back_val();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
    if (RHS->getOpcode() == ISD::OR)
      std::swap(LHS, RHS);
    EVT VT = OR->getValueType(0);
    SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
  }

  if (Ret)
    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

  return Ret;
}
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI,
                         const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // If this is SSE1 only, convert to FOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(MVT::v4i32,
                          DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
                                      DAG.getBitcast(MVT::v4f32, N0),
                                      DAG.getBitcast(MVT::v4f32, N1)));
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
    return R;

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent
  // series of shifts/ors that would otherwise be generated.
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
  // have higher latencies and we are not optimizing for size.
  if (!OptForSize && Subtarget.isSHLDSlow())
    return SDValue();

  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB ||
      ShAmt0.getOpcode() == ISD::XOR) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
  // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  } else if (ShAmt1.getOpcode() == ISD::XOR) {
    SDValue Mask = ShAmt1.getOperand(1);
    if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
      unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
      SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
      if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op0 = ShAmt1Op0.getOperand(0);
      if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
        if (Op1.getOpcode() == InnerShift &&
            isa<ConstantSDNode>(Op1.getOperand(1)) &&
            Op1.getConstantOperandVal(1) == 1) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                     DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
        // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
        if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
            Op1.getOperand(0) == Op1.getOperand(1)) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                     DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
      }
    }
  }

  return SDValue();
}
/// Try to turn tests against the signbit in the form of:
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
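/// E.g. (illustrative, i32): srl(X, 31) extracts the sign bit, so xoring
/// the truncated bit with 1 tests "sign bit clear", which is exactly the
/// signed comparison X > -1.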
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
  // This is only worth doing if the output type is i8 or i1.
  EVT ResultType = N->getValueType(0);
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We should be performing an xor against a truncated shift.
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
    return SDValue();

  // Make sure we are performing an xor against one.
  if (!isOneConstant(N1))
    return SDValue();

  // SetCC on x86 zero extends so only act on this if it's a logical shift.
  SDValue Shift = N0.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
    return SDValue();

  // Make sure we are truncating from one of i16, i32 or i64.
  EVT ShiftTy = Shift.getValueType();
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
    return SDValue();

  // Make sure the shift amount extracts the sign bit.
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
    return SDValue();

  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical looking
  // comparison, and using SETGT matches up with what TranslateX86CC does.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
  EVT ShiftOpTy = ShiftOp.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
                                               *DAG.getContext(), ResultType);
  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
                              DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
  if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}
/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isSimple())
    return SDValue();

  switch (VT.getSimpleVT().SimpleTy) {
  default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
  case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  }

  // There must be a shift right algebraic before the xor, and the xor must be a
  // 'not' operation.
  SDValue Shift = N->getOperand(0);
  SDValue Ones = N->getOperand(1);
  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
      !ISD::isBuildVectorAllOnes(Ones.getNode()))
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
  if (!ShiftBV)
    return SDValue();

  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
  auto *ShiftAmt = ShiftBV->getConstantSplatNode();
  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
    return SDValue();

  // Create a greater-than comparison against -1. We don't use the more obvious
  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
/// Check if truncation with saturation from type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
                                        const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX512())
    return false;

  // FIXME: Scalar type may be supported if we move it to a vector register.
  if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
    return false;

  EVT SrcElVT = SrcVT.getScalarType();
  EVT DstElVT = DstVT.getScalarType();
  if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
    return false;
  if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
    return false;
  if (SrcVT.is512BitVector() || Subtarget.hasVLX())
    return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
  return false;
}
33815 /// Detect a pattern of truncation with saturation:
33816 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated, or SDValue() if the pattern was
/// not detected.
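///
/// For example (an illustrative instance), for a v8i32 -> v8i16 truncate:
///   (v8i16 (truncate (umin %x, (v8i32 <65535, ..., 65535>))))
/// returns %x, since 65535 is exactly the mask of the i16 destination element.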
33819 static SDValue detectUSatPattern(SDValue In, EVT VT) {
33820 if (In.getOpcode() != ISD::UMIN)
// Saturation with truncation. We truncate from InVT to VT.
33824 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
33825 "Unexpected types for truncate operation");
APInt C;
if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
// C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
// the element size of the destination type.
return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
33837 /// Detect a pattern of truncation with saturation:
33838 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow using the VPMOVUS* instructions on AVX512.
/// Return the source value to be truncated, or SDValue() if the pattern was
/// not detected.
33842 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
33843 const X86Subtarget &Subtarget) {
33844 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
33846 return detectUSatPattern(In, VT);
static SDValue
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
33851 const X86Subtarget &Subtarget) {
33852 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33853 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
33855 if (auto USatVal = detectUSatPattern(In, VT))
33856 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
33857 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
33861 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
33863 /// X86ISD::AVG instruction.
33864 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
33865 const X86Subtarget &Subtarget,
33867 if (!VT.isVector() || !VT.isSimple())
33869 EVT InVT = In.getValueType();
33870 unsigned NumElems = VT.getVectorNumElements();
33872 EVT ScalarVT = VT.getVectorElementType();
33873 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
33874 isPowerOf2_32(NumElems)))
// InScalarVT is the intermediate type in the AVG pattern, and it should be
// wider than the original input type (i8/i16).
33879 EVT InScalarVT = InVT.getVectorElementType();
33880 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
33883 if (!Subtarget.hasSSE2())
33885 if (Subtarget.hasBWI()) {
33886 if (VT.getSizeInBits() > 512)
33888 } else if (Subtarget.hasAVX2()) {
33889 if (VT.getSizeInBits() > 256)
33892 if (VT.getSizeInBits() > 128)
33896 // Detect the following pattern:
33898 // %1 = zext <N x i8> %a to <N x i32>
33899 // %2 = zext <N x i8> %b to <N x i32>
33900 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
33901 // %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
33903 // %6 = trunc <N x i32> %5 to <N x i8>
33905 // In AVX512, the last instruction can also be a trunc store.
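//
// When the pattern matches (with %a and %b the original i8/i16 vectors), the
// whole sequence collapses into a single node:
//   %res = X86ISD::AVG %a, %b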
33907 if (In.getOpcode() != ISD::SRL)
// A lambda checking whether the given SDValue is a constant vector whose
// elements are all in the range [Min, Max].
33912 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
33913 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
33914 if (!BV || !BV->isConstant())
33916 for (SDValue Op : V->ops()) {
33917 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
33920 uint64_t Val = C->getZExtValue();
33921 if (Val < Min || Val > Max)
// Check if each element of the vector is right-shifted by one.
33928 auto LHS = In.getOperand(0);
33929 auto RHS = In.getOperand(1);
33930 if (!IsConstVectorInRange(RHS, 1, 1))
33932 if (LHS.getOpcode() != ISD::ADD)
33935 // Detect a pattern of a + b + 1 where the order doesn't matter.
33936 SDValue Operands[3];
33937 Operands[0] = LHS.getOperand(0);
33938 Operands[1] = LHS.getOperand(1);
// Take care of the case when one of the operands is a constant vector whose
// elements are in the range [1, 256] for i8 or [1, 65536] for i16.
33942 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
33943 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
33944 Operands[0].getOperand(0).getValueType() == VT) {
33945 // The pattern is detected. Subtract one from the constant vector, then
33946 // demote it and emit X86ISD::AVG instruction.
33947 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
33948 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
33949 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
33950 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
33954 if (Operands[0].getOpcode() == ISD::ADD)
33955 std::swap(Operands[0], Operands[1]);
33956 else if (Operands[1].getOpcode() != ISD::ADD)
33958 Operands[2] = Operands[1].getOperand(0);
33959 Operands[1] = Operands[1].getOperand(1);
33961 // Now we have three operands of two additions. Check that one of them is a
33962 // constant vector with ones, and the other two are promoted from i8/i16.
33963 for (int i = 0; i < 3; ++i) {
33964 if (!IsConstVectorInRange(Operands[i], 1, 1))
33966 std::swap(Operands[i], Operands[2]);
33968 // Check if Operands[0] and Operands[1] are results of type promotion.
33969 for (int j = 0; j < 2; ++j)
33970 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
33971 Operands[j].getOperand(0).getValueType() != VT)
33974 // The pattern is detected, emit X86ISD::AVG instruction.
33975 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
33976 Operands[1].getOperand(0));
33982 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
33983 TargetLowering::DAGCombinerInfo &DCI,
33984 const X86Subtarget &Subtarget) {
33985 LoadSDNode *Ld = cast<LoadSDNode>(N);
33986 EVT RegVT = Ld->getValueType(0);
33987 EVT MemVT = Ld->getMemoryVT();
33989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33991 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
33992 // into two 16-byte operations. Also split non-temporal aligned loads on
33993 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
33994 ISD::LoadExtType Ext = Ld->getExtensionType();
33996 unsigned AddressSpace = Ld->getAddressSpace();
33997 unsigned Alignment = Ld->getAlignment();
33998 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
33999 Ext == ISD::NON_EXTLOAD &&
34000 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
34001 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
34002 AddressSpace, Alignment, &Fast) && !Fast))) {
34003 unsigned NumElems = RegVT.getVectorNumElements();
34007 SDValue Ptr = Ld->getBasePtr();
34009 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
34012 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
34013 Alignment, Ld->getMemOperand()->getFlags());
34015 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
34017 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
34018 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
34019 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34021 Load2.getValue(1));
34023 SDValue NewVec = DAG.getUNDEF(RegVT);
34024 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
34025 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
34026 return DCI.CombineTo(N, NewVec, TF, true);
34032 /// If V is a build vector of boolean constants and exactly one of those
34033 /// constants is true, return the operand index of that true element.
34034 /// Otherwise, return -1.
34035 static int getOneTrueElt(SDValue V) {
34036 // This needs to be a build vector of booleans.
34037 // TODO: Checking for the i1 type matches the IR definition for the mask,
34038 // but the mask check could be loosened to i8 or other types. That might
34039 // also require checking more than 'allOnesValue'; eg, the x86 HW
34040 // instructions only require that the MSB is set for each mask element.
34041 // The ISD::MSTORE comments/definition do not specify how the mask operand
34043 auto *BV = dyn_cast<BuildVectorSDNode>(V);
34044 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
34047 int TrueIndex = -1;
34048 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
34049 for (unsigned i = 0; i < NumElts; ++i) {
34050 const SDValue &Op = BV->getOperand(i);
34053 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
34056 if (ConstNode->getAPIntValue().isAllOnesValue()) {
34057 // If we already found a one, this is too many.
34058 if (TrueIndex >= 0)
34066 /// Given a masked memory load/store operation, return true if it has one mask
34067 /// bit set. If it has one mask bit set, then also return the memory address of
34068 /// the scalar element to load/store, the vector index to insert/extract that
34069 /// scalar element, and the alignment for the scalar memory access.
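/// For example (illustrative), for a v4i32 operation with mask <0,0,1,0> this
/// returns Addr = BasePtr + 2 * 4 bytes, Index = 2, and an alignment clamped
/// to the scalar element's store size.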
34070 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
34071 SelectionDAG &DAG, SDValue &Addr,
34072 SDValue &Index, unsigned &Alignment) {
34073 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
34074 if (TrueMaskElt < 0)
34077 // Get the address of the one scalar element that is specified by the mask
34078 // using the appropriate offset from the base pointer.
34079 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
34080 Addr = MaskedOp->getBasePtr();
34081 if (TrueMaskElt != 0) {
34082 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
34083 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
34086 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
34087 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
/// If exactly one element of the mask is set for a non-extending masked load,
/// it can be reduced to a scalar load and a vector insert.
34093 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34094 /// mask have already been optimized in IR, so we don't bother with those here.
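///
/// For example (an illustrative sketch):
///   (v4i32 masked_load Ptr, mask <0,0,1,0>, Src0)
/// becomes
///   (insert_vector_elt Src0, (load (add Ptr, 8)), 2)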
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34097 TargetLowering::DAGCombinerInfo &DCI) {
34098 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34099 // However, some target hooks may need to be added to know when the transform
34100 // is profitable. Endianness would also have to be considered.
34102 SDValue Addr, VecIndex;
34103 unsigned Alignment;
34104 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
34107 // Load the one scalar element that is specified by the mask using the
34108 // appropriate offset from the base pointer.
34110 EVT VT = ML->getValueType(0);
34111 EVT EltVT = VT.getVectorElementType();
34113 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
34114 Alignment, ML->getMemOperand()->getFlags());
34116 // Insert the loaded element into the appropriate place in the vector.
34117 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
34119 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34124 TargetLowering::DAGCombinerInfo &DCI) {
34125 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
34129 EVT VT = ML->getValueType(0);
34131 // If we are loading the first and last elements of a vector, it is safe and
34132 // always faster to load the whole vector. Replace the masked load with a
34133 // vector load and select.
34134 unsigned NumElts = VT.getVectorNumElements();
34135 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
34136 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
34137 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
34138 if (LoadFirstElt && LoadLastElt) {
34139 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34140 ML->getMemOperand());
34141 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
34142 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
34145 // Convert a masked load with a constant mask into a masked load and a select.
34146 // This allows the select operation to use a faster kind of select instruction
34147 // (for example, vblendvps -> vblendps).
34149 // Don't try this if the pass-through operand is already undefined. That would
34150 // cause an infinite loop because that's what we're about to create.
34151 if (ML->getSrc0().isUndef())
34154 // The new masked load has an undef pass-through operand. The select uses the
34155 // original pass-through operand.
34156 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34157 ML->getMask(), DAG.getUNDEF(VT),
34158 ML->getMemoryVT(), ML->getMemOperand(),
34159 ML->getExtensionType());
34160 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
34162 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
34165 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
34166 TargetLowering::DAGCombinerInfo &DCI,
34167 const X86Subtarget &Subtarget) {
34168 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
34170 // TODO: Expanding load with constant mask may be optimized as well.
34171 if (Mld->isExpandingLoad())
34174 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
34175 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
34177 // TODO: Do some AVX512 subsets benefit from this transform?
34178 if (!Subtarget.hasAVX512())
34179 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
34183 if (Mld->getExtensionType() != ISD::SEXTLOAD)
34186 // Resolve extending loads.
34187 EVT VT = Mld->getValueType(0);
34188 unsigned NumElems = VT.getVectorNumElements();
34189 EVT LdVT = Mld->getMemoryVT();
34192 assert(LdVT != VT && "Cannot extend to the same type");
34193 unsigned ToSz = VT.getScalarSizeInBits();
34194 unsigned FromSz = LdVT.getScalarSizeInBits();
// From/To sizes and ElemCount must be powers of two.
34196 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34197 "Unexpected size for extending masked load");
34199 unsigned SizeRatio = ToSz / FromSz;
34200 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
34202 // Create a type on which we perform the shuffle.
34203 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34204 LdVT.getScalarType(), NumElems*SizeRatio);
34205 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34207 // Convert Src0 value.
34208 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
34209 if (!Mld->getSrc0().isUndef()) {
34210 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34211 for (unsigned i = 0; i != NumElems; ++i)
34212 ShuffleVec[i] = i * SizeRatio;
34214 // Can't shuffle using an illegal type.
34215 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34216 "WideVecVT should be legal");
34217 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
34218 DAG.getUNDEF(WideVecVT), ShuffleVec);
34221 // Prepare the new mask.
34223 SDValue Mask = Mld->getMask();
34224 if (Mask.getValueType() == VT) {
34225 // Mask and original value have the same type.
34226 NewMask = DAG.getBitcast(WideVecVT, Mask);
34227 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34228 for (unsigned i = 0; i != NumElems; ++i)
34229 ShuffleVec[i] = i * SizeRatio;
34230 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
34231 ShuffleVec[i] = NumElems * SizeRatio;
34232 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34233 DAG.getConstant(0, dl, WideVecVT),
34236 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34237 unsigned WidenNumElts = NumElems*SizeRatio;
34238 unsigned MaskNumElts = VT.getVectorNumElements();
34239 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34242 unsigned NumConcat = WidenNumElts / MaskNumElts;
34243 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34244 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34246 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34249 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
34250 Mld->getBasePtr(), NewMask, WideSrc0,
34251 Mld->getMemoryVT(), Mld->getMemOperand(),
34253 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
34254 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
/// If exactly one element of the mask is set for a non-truncating masked
/// store, it can be reduced to a vector extract and a scalar store.
34259 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34260 /// mask have already been optimized in IR, so we don't bother with those here.
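///
/// For example (an illustrative sketch):
///   (masked_store (v4i32 Val), Ptr, mask <0,1,0,0>)
/// becomes
///   (store (extract_vector_elt Val, 1), (add Ptr, 4))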
34261 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
34262 SelectionDAG &DAG) {
34263 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34264 // However, some target hooks may need to be added to know when the transform
34265 // is profitable. Endianness would also have to be considered.
34267 SDValue Addr, VecIndex;
34268 unsigned Alignment;
34269 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
34272 // Extract the one scalar element that is actually being stored.
34274 EVT VT = MS->getValue().getValueType();
34275 EVT EltVT = VT.getVectorElementType();
34276 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
34277 MS->getValue(), VecIndex);
34279 // Store that element at the appropriate offset from the base pointer.
34280 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
34281 Alignment, MS->getMemOperand()->getFlags());
34284 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
34285 const X86Subtarget &Subtarget) {
34286 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
34288 if (Mst->isCompressingStore())
34291 if (!Mst->isTruncatingStore()) {
34292 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
34293 return ScalarStore;
34295 // If the mask is checking (0 > X), we're creating a vector with all-zeros
34296 // or all-ones elements based on the sign bits of X. AVX1 masked store only
34297 // cares about the sign bit of each mask element, so eliminate the compare:
34298 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
34299 // Note that by waiting to match an x86-specific PCMPGT node, we're
34300 // eliminating potentially more complex matching of a setcc node which has
34301 // a full range of predicates.
34302 SDValue Mask = Mst->getMask();
34303 if (Mask.getOpcode() == X86ISD::PCMPGT &&
34304 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
34305 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
34306 "Unexpected type for PCMPGT");
34307 return DAG.getMaskedStore(
34308 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
34309 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
34312 // TODO: AVX512 targets should also be able to simplify something like the
34313 // pattern above, but that pattern will be different. It will either need to
34314 // match setcc more generally or match PCMPGTM later (in tablegen?).
34319 // Resolve truncating stores.
34320 EVT VT = Mst->getValue().getValueType();
34321 unsigned NumElems = VT.getVectorNumElements();
34322 EVT StVT = Mst->getMemoryVT();
34325 assert(StVT != VT && "Cannot truncate to the same type");
34326 unsigned FromSz = VT.getScalarSizeInBits();
34327 unsigned ToSz = StVT.getScalarSizeInBits();
34329 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// The truncating store is legal in some cases. For example,
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
// are designed for truncating stores.
// In those cases we don't need any further transformations.
34335 if (TLI.isTruncStoreLegal(VT, StVT))
// From/To sizes and ElemCount must be powers of two.
34339 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34340 "Unexpected size for truncating masked store");
34341 // We are going to use the original vector elt for storing.
34342 // Accumulated smaller vector elements must be a multiple of the store size.
34343 assert (((NumElems * FromSz) % ToSz) == 0 &&
34344 "Unexpected ratio for truncating masked store");
34346 unsigned SizeRatio = FromSz / ToSz;
34347 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34349 // Create a type on which we perform the shuffle.
34350 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34351 StVT.getScalarType(), NumElems*SizeRatio);
34353 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34355 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
34356 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34357 for (unsigned i = 0; i != NumElems; ++i)
34358 ShuffleVec[i] = i * SizeRatio;
34360 // Can't shuffle using an illegal type.
34361 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34362 "WideVecVT should be legal");
34364 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34365 DAG.getUNDEF(WideVecVT),
34369 SDValue Mask = Mst->getMask();
34370 if (Mask.getValueType() == VT) {
34371 // Mask and original value have the same type.
34372 NewMask = DAG.getBitcast(WideVecVT, Mask);
34373 for (unsigned i = 0; i != NumElems; ++i)
34374 ShuffleVec[i] = i * SizeRatio;
34375 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
34376 ShuffleVec[i] = NumElems*SizeRatio;
34377 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34378 DAG.getConstant(0, dl, WideVecVT),
34381 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34382 unsigned WidenNumElts = NumElems*SizeRatio;
34383 unsigned MaskNumElts = VT.getVectorNumElements();
34384 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34387 unsigned NumConcat = WidenNumElts / MaskNumElts;
34388 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34389 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34391 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34394 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
34395 Mst->getBasePtr(), NewMask, StVT,
34396 Mst->getMemOperand(), false);
34399 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
34400 const X86Subtarget &Subtarget) {
34401 StoreSDNode *St = cast<StoreSDNode>(N);
34402 EVT VT = St->getValue().getValueType();
34403 EVT StVT = St->getMemoryVT();
34405 SDValue StoredVal = St->getOperand(1);
34406 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34408 // If we are saving a concatenation of two XMM registers and 32-byte stores
34409 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
34411 unsigned AddressSpace = St->getAddressSpace();
34412 unsigned Alignment = St->getAlignment();
34413 if (VT.is256BitVector() && StVT == VT &&
34414 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
34415 AddressSpace, Alignment, &Fast) &&
34417 unsigned NumElems = VT.getVectorNumElements();
34421 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
34422 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
34424 SDValue Ptr0 = St->getBasePtr();
34425 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
34428 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
34429 Alignment, St->getMemOperand()->getFlags());
34431 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
34432 std::min(16U, Alignment), St->getMemOperand()->getFlags());
34433 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
34436 // Optimize trunc store (of multiple scalars) to shuffle and store.
34437 // First, pack all of the elements in one place. Next, store to memory
34438 // in fewer chunks.
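// For example (a sketch; the exact types depend on the store): a
// v8i32 -> v8i16 truncating store is rewritten as a bitcast to v16i16, a
// shuffle moving the low-half elements <0,2,4,...> to the front, and one or
// more wide integer stores of the packed low bits.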
34439 if (St->isTruncatingStore() && VT.isVector()) {
34440 // Check if we can detect an AVG pattern from the truncation. If yes,
34441 // replace the trunc store by a normal store with the result of X86ISD::AVG
34443 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
34445 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
34446 St->getPointerInfo(), St->getAlignment(),
34447 St->getMemOperand()->getFlags());
34450 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
34451 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
34452 dl, Val, St->getBasePtr(),
34453 St->getMemoryVT(), St->getMemOperand(), DAG);
34455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34456 unsigned NumElems = VT.getVectorNumElements();
34457 assert(StVT != VT && "Cannot truncate to the same type");
34458 unsigned FromSz = VT.getScalarSizeInBits();
34459 unsigned ToSz = StVT.getScalarSizeInBits();
// The truncating store is legal in some cases. For example,
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
// are designed for truncating stores.
// In those cases we don't need any further transformations.
34465 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
// From/To sizes and ElemCount must be powers of two.
34469 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
34470 // We are going to use the original vector elt for storing.
34471 // Accumulated smaller vector elements must be a multiple of the store size.
34472 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
34474 unsigned SizeRatio = FromSz / ToSz;
34476 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34478 // Create a type on which we perform the shuffle
34479 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34480 StVT.getScalarType(), NumElems*SizeRatio);
34482 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34484 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
34485 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
34486 for (unsigned i = 0; i != NumElems; ++i)
34487 ShuffleVec[i] = i * SizeRatio;
34489 // Can't shuffle using an illegal type.
34490 if (!TLI.isTypeLegal(WideVecVT))
34493 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34494 DAG.getUNDEF(WideVecVT),
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to memory.
34499 // Find the largest store unit
34500 MVT StoreType = MVT::i8;
34501 for (MVT Tp : MVT::integer_valuetypes()) {
34502 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
// On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
34507 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
34508 (64 <= NumElems * ToSz))
34509 StoreType = MVT::f64;
34511 // Bitcast the original vector into a vector of store-size units
34512 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
34513 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
34514 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
34515 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
34516 SmallVector<SDValue, 8> Chains;
34517 SDValue Ptr = St->getBasePtr();
34519 // Perform one or more big stores into memory.
34520 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
34521 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
34522 StoreType, ShuffWide,
34523 DAG.getIntPtrConstant(i, dl));
34525 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
34526 St->getAlignment(), St->getMemOperand()->getFlags());
34527 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
34528 Chains.push_back(Ch);
34531 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
34534 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
34535 // the FP state in cases where an emms may be missing.
34536 // A preferable solution to the general problem is to figure out the right
34537 // places to insert EMMS. This qualifies as a quick hack.
34539 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
34540 if (VT.getSizeInBits() != 64)
34543 const Function &F = DAG.getMachineFunction().getFunction();
34544 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
    !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
34547 if ((VT.isVector() ||
34548 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
34549 isa<LoadSDNode>(St->getValue()) &&
34550 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
34551 St->getChain().hasOneUse() && !St->isVolatile()) {
34552 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
34553 SmallVector<SDValue, 8> Ops;
34555 if (!ISD::isNormalLoad(Ld))
34558 // If this is not the MMX case, i.e. we are just turning i64 load/store
34559 // into f64 load/store, avoid the transformation if there are multiple
34560 // uses of the loaded value.
34561 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
34566 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
34567 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
34569 if (Subtarget.is64Bit() || F64IsLegal) {
34570 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
34571 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
34572 Ld->getMemOperand());
34574 // Make sure new load is placed in same chain order.
34575 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
34576 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
34577 St->getMemOperand());
34580 // Otherwise, lower to two pairs of 32-bit loads / stores.
34581 SDValue LoAddr = Ld->getBasePtr();
34582 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
34584 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
34585 Ld->getPointerInfo(), Ld->getAlignment(),
34586 Ld->getMemOperand()->getFlags());
34587 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
34588 Ld->getPointerInfo().getWithOffset(4),
34589 MinAlign(Ld->getAlignment(), 4),
34590 Ld->getMemOperand()->getFlags());
34591 // Make sure new loads are placed in same chain order.
34592 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
34593 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
34595 LoAddr = St->getBasePtr();
34596 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
34599 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
34600 St->getAlignment(), St->getMemOperand()->getFlags());
34601 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
34602 St->getPointerInfo().getWithOffset(4),
34603 MinAlign(St->getAlignment(), 4),
34604 St->getMemOperand()->getFlags());
34605 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
34608 // This is similar to the above case, but here we handle a scalar 64-bit
34609 // integer store that is extracted from a vector on a 32-bit target.
34610 // If we have SSE2, then we can treat it like a floating-point double
34611 // to get past legalization. The execution dependencies fixup pass will
34612 // choose the optimal machine instruction for the store if this really is
34613 // an integer or v2f32 rather than an f64.
34614 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
34615 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
34616 SDValue OldExtract = St->getOperand(1);
34617 SDValue ExtOp0 = OldExtract.getOperand(0);
34618 unsigned VecSize = ExtOp0.getValueSizeInBits();
34619 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
34620 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
34621 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
34622 BitCast, OldExtract.getOperand(1));
34623 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
34624 St->getPointerInfo(), St->getAlignment(),
34625 St->getMemOperand()->getFlags());
34631 /// Return 'true' if this vector operation is "horizontal"
34632 /// and return the operands for the horizontal operation in LHS and RHS. A
34633 /// horizontal operation performs the binary operation on successive elements
34634 /// of its first operand, then on successive elements of its second operand,
34635 /// returning the resulting values in a vector. For example, if
34636 /// A = < float a0, float a1, float a2, float a3 >
34638 /// B = < float b0, float b1, float b2, float b3 >
34639 /// then the result of doing a horizontal operation on A and B is
34640 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
34641 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
34642 /// A horizontal-op B, for some already available A and B, and if so then LHS is
34643 /// set to A, RHS to B, and the routine returns 'true'.
34644 /// Note that the binary operation should have the property that if one of the
34645 /// operands is UNDEF then the result is UNDEF.
34646 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
34647 // Look for the following pattern: if
34648 // A = < float a0, float a1, float a2, float a3 >
34649 // B = < float b0, float b1, float b2, float b3 >
34651 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
34652 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
34653 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
34654 // which is A horizontal-op B.
34656 // At least one of the operands should be a vector shuffle.
34657 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
34658 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
34661 MVT VT = LHS.getSimpleValueType();
34663 assert((VT.is128BitVector() || VT.is256BitVector()) &&
34664 "Unsupported vector type for horizontal add/sub");
34666 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
34667 // operate independently on 128-bit lanes.
34668 unsigned NumElts = VT.getVectorNumElements();
34669 unsigned NumLanes = VT.getSizeInBits()/128;
34670 unsigned NumLaneElts = NumElts / NumLanes;
34671 assert((NumLaneElts % 2 == 0) &&
34672 "Vector type should have an even number of elements in each lane");
34673 unsigned HalfLaneElts = NumLaneElts/2;
34675 // View LHS in the form
34676 // LHS = VECTOR_SHUFFLE A, B, LMask
34677 // If LHS is not a shuffle then pretend it is the shuffle
34678 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: in what follows a default initialized SDValue represents an UNDEF of
// type VT.
SDValue A, B;
34682 SmallVector<int, 16> LMask(NumElts);
34683 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
34684 if (!LHS.getOperand(0).isUndef())
34685 A = LHS.getOperand(0);
34686 if (!LHS.getOperand(1).isUndef())
34687 B = LHS.getOperand(1);
34688 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
34689 std::copy(Mask.begin(), Mask.end(), LMask.begin());
34691 if (!LHS.isUndef())
34693 for (unsigned i = 0; i != NumElts; ++i)
34697 // Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
34700 SmallVector<int, 16> RMask(NumElts);
34701 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
34702 if (!RHS.getOperand(0).isUndef())
34703 C = RHS.getOperand(0);
34704 if (!RHS.getOperand(1).isUndef())
34705 D = RHS.getOperand(1);
34706 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
34707 std::copy(Mask.begin(), Mask.end(), RMask.begin());
34709 if (!RHS.isUndef())
34711 for (unsigned i = 0; i != NumElts; ++i)
34715 // Check that the shuffles are both shuffling the same vectors.
34716 if (!(A == C && B == D) && !(A == D && B == C))
34719 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
34720 if (!A.getNode() && !B.getNode())
34723 // If A and B occur in reverse order in RHS, then "swap" them (which means
34724 // rewriting the mask).
34726 ShuffleVectorSDNode::commuteMask(RMask);
34728 // At this point LHS and RHS are equivalent to
34729 // LHS = VECTOR_SHUFFLE A, B, LMask
34730 // RHS = VECTOR_SHUFFLE A, B, RMask
34731 // Check that the masks correspond to performing a horizontal operation.
34732 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
34733 for (unsigned i = 0; i != NumLaneElts; ++i) {
34734 int LIdx = LMask[i+l], RIdx = RMask[i+l];
34736 // Ignore any UNDEF components.
34737 if (LIdx < 0 || RIdx < 0 ||
34738 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
34739 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
34742 // Check that successive elements are being operated on. If not, this is
34743 // not a horizontal operation.
34744 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
34745 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
34746 if (!(LIdx == Index && RIdx == Index + 1) &&
34747 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
34752 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
34753 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
34757 /// Do target-specific dag combines on floating-point adds/subs.
34758 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
34759 const X86Subtarget &Subtarget) {
34760 EVT VT = N->getValueType(0);
34761 SDValue LHS = N->getOperand(0);
34762 SDValue RHS = N->getOperand(1);
34763 bool IsFadd = N->getOpcode() == ISD::FADD;
34764 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
34766 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
34767 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
34768 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
34769 isHorizontalBinOp(LHS, RHS, IsFadd)) {
34770 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
34771 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
34778 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
34779 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
34780 const X86Subtarget &Subtarget,
34782 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
34783 SDValue Src = N->getOperand(0);
34784 unsigned Opcode = Src.getOpcode();
34785 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34787 EVT VT = N->getValueType(0);
34788 EVT SrcVT = Src.getValueType();
34790 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
34791 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
34793 // Repeated operand, so we are only trading one output truncation for
34794 // one input truncation.
34798 // See if either operand has been extended from a smaller/equal size to
34799 // the truncation size, allowing a truncation to combine with the extend.
34800 unsigned Opcode0 = Op0.getOpcode();
34801 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
34802 Opcode0 == ISD::ZERO_EXTEND) &&
34803 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
34806 unsigned Opcode1 = Op1.getOpcode();
34807 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
34808 Opcode1 == ISD::ZERO_EXTEND) &&
34809 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
// See if either operand is a single-use constant which can be constant
// folded.
34814 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
34815 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
34816 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
34817 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
34820 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
34821 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
34822 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
34823 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
34826 // Don't combine if the operation has other uses.
34827 if (!N->isOnlyUserOf(Src.getNode()))
34830 // Only support vector truncation for now.
34831 // TODO: i64 scalar math would benefit as well.
34832 if (!VT.isVector())
// In most cases it's only worth pre-truncating if we're only facing the cost
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
34842 SDValue Op0 = Src.getOperand(0);
34843 SDValue Op1 = Src.getOperand(1);
34844 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
34845 IsRepeatedOpOrFreeTruncation(Op0, Op1))
34846 return TruncateArithmetic(Op0, Op1);
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
// better to truncate if we have the chance.
34853 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
34854 !TLI.isOperationLegal(Opcode, SrcVT))
34855 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
34858 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
34859 SDValue Op0 = Src.getOperand(0);
34860 SDValue Op1 = Src.getOperand(1);
34861 if (TLI.isOperationLegal(Opcode, VT) &&
34862 IsRepeatedOpOrFreeTruncation(Op0, Op1))
34863 return TruncateArithmetic(Op0, Op1);
34871 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
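///
/// For example (a sketch): to truncate 2 x v4i32 to v8i16, each input is first
/// masked with 0xFFFF so that no element can saturate, and then a single
///   (v8i16 (X86ISD::PACKUS %a, %b))
/// produces the result; wider element ratios repeat the pack-by-halves step.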
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
34874 SmallVector<SDValue, 8> &Regs) {
34875 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
34876 Regs[0].getValueType() == MVT::v2i64));
34877 EVT OutVT = N->getValueType(0);
34878 EVT OutSVT = OutVT.getVectorElementType();
34879 EVT InVT = Regs[0].getValueType();
34880 EVT InSVT = InVT.getVectorElementType();
34883 // First, use mask to unset all bits that won't appear in the result.
34884 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
34885 "OutSVT can only be either i8 or i16.");
APInt Mask =
    APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
34888 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
34889 for (auto &Reg : Regs)
34890 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
34892 MVT UnpackedVT, PackedVT;
34893 if (OutSVT == MVT::i8) {
34894 UnpackedVT = MVT::v8i16;
34895 PackedVT = MVT::v16i8;
34897 UnpackedVT = MVT::v4i32;
34898 PackedVT = MVT::v8i16;
// In each iteration, truncate the element type to half its size.
34902 auto RegNum = Regs.size();
34903 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
34904 j < e; j *= 2, RegNum /= 2) {
34905 for (unsigned i = 0; i < RegNum; i++)
34906 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
34907 for (unsigned i = 0; i < RegNum / 2; i++)
34908 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
// If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
// and then extract a subvector as the result since v8i8 is not a legal type.
34914 if (OutVT == MVT::v8i8) {
34915 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
34916 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
34917 DAG.getIntPtrConstant(0, DL));
34919 } else if (RegNum > 1) {
34920 Regs.resize(RegNum);
34921 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
34926 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
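///
/// The idea (sketched): PACKSS saturates on signed inputs, so each 32-bit lane
/// is first sign-filled from bit 15 via (sra (shl x, 16), 16); the saturating
/// pack then reproduces the low 16 bits of every element exactly.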
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
34930 SmallVector<SDValue, 8> &Regs) {
34931 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
34932 EVT OutVT = N->getValueType(0);
34935 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
34936 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
34937 for (auto &Reg : Regs) {
34938 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
34940 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
34944 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
34945 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
34948 if (Regs.size() > 2) {
34949 Regs.resize(Regs.size() / 2);
34950 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
34955 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element extracted from a vector and then truncated, and it is difficult to
/// perform this optimization on that form.
34960 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
34961 const X86Subtarget &Subtarget) {
34962 EVT OutVT = N->getValueType(0);
34963 if (!OutVT.isVector())
34966 SDValue In = N->getOperand(0);
34967 if (!In.getValueType().isSimple())
34970 EVT InVT = In.getValueType();
34971 unsigned NumElems = OutVT.getVectorNumElements();
34973 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
34974 // SSE2, and we need to take care of it specially.
34975 // AVX512 provides vpmovdb.
34976 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
34979 EVT OutSVT = OutVT.getVectorElementType();
34980 EVT InSVT = InVT.getVectorElementType();
34981 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
34982 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
// SSSE3's pshufb results in fewer instructions in the cases below.
34987 if (Subtarget.hasSSSE3() && NumElems == 8 &&
34988 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
34989 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
34994 // Split a long vector into vectors of legal type.
34995 unsigned RegNum = InVT.getSizeInBits() / 128;
34996 SmallVector<SDValue, 8> SubVec(RegNum);
34997 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
34998 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
35000 for (unsigned i = 0; i < RegNum; i++)
35001 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
35002 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
35004 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
35005 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
35006 // truncate 2 x v4i32 to v8i16.
35007 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
35008 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
35009 else if (InSVT == MVT::i32)
35010 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
/// This function transforms vector truncations of 'extended sign-bits' or
/// 'extended zero-bits' values, from vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
/// into X86ISD::PACKSS/PACKUS operations.
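///
/// For example (illustrative), when truncating a comparison result, where
/// every element is already all-zeros or all-ones:
///   (v16i8 (trunc (v16i16 setcc ...)))
/// can become a single X86ISD::PACKSS of the two halves, since signed
/// saturation preserves such values exactly.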
35018 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
35020 const X86Subtarget &Subtarget) {
35021 // Requires SSE2 but AVX512 has fast truncate.
35022 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
35025 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
35028 SDValue In = N->getOperand(0);
35029 if (!In.getValueType().isSimple())
35032 MVT VT = N->getValueType(0).getSimpleVT();
35033 MVT SVT = VT.getScalarType();
35035 MVT InVT = In.getValueType().getSimpleVT();
35036 MVT InSVT = InVT.getScalarType();
35038 // Check we have a truncation suited for PACKSS.
35039 if (!VT.is128BitVector() && !VT.is256BitVector())
35041 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
35043 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
35046 // Use PACKSS if the input has sign-bits that extend all the way to the
35047 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
35048 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
35049 unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
35050 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
35051 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
35053 // Use PACKUS if the input has zero-bits that extend all the way to the
35054 // packed/truncated value. e.g. masks, zext_in_reg, etc.
KnownBits Known;
DAG.computeKnownBits(In, Known);
35057 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
35058 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
35059 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
35060 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
35065 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
35066 const X86Subtarget &Subtarget) {
35067 EVT VT = N->getValueType(0);
35068 SDValue Src = N->getOperand(0);
35071 // Attempt to pre-truncate inputs to arithmetic ops instead.
35072 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
35075 // Try to detect AVG pattern first.
35076 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
35079 // Try to combine truncation with unsigned saturation.
35080 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
// Detect a truncation to i32 of a bitcast from an x86mmx value; the bitcast
// source is a direct MMX result.
35085 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
35086 SDValue BCSrc = Src.getOperand(0);
35087 if (BCSrc.getValueType() == MVT::x86mmx)
35088 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
35091 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
35092 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
35095 return combineVectorTruncation(N, DAG, Subtarget);
35098 /// Returns the negated value if the node \p N flips sign of FP value.
35100 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
35101 /// AVX512F does not have FXOR, so FNEG is lowered as
35102 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all the bitcasts.
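///
/// For example (illustrative), each of the following is recognized as a
/// negation of x:
///   (fneg x)
///   (fxor x, ConstantFP(-0.0))
///   (bitcast (xor (bitcast x), (bitcast ConstantFP(-0.0))))
/// where -0.0 carries the sign-mask bit pattern (0x80000000 for f32).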
35104 static SDValue isFNEG(SDNode *N) {
35105 if (N->getOpcode() == ISD::FNEG)
35106 return N->getOperand(0);
35108 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
35109 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
35112 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
35113 if (!Op1.getValueType().isFloatingPoint())
35116 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
35118 unsigned EltBits = Op1.getScalarValueSizeInBits();
35119 auto isSignMask = [&](const ConstantFP *C) {
35120 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
35123 // There is more than one way to represent the same constant on
35124 // the different X86 targets. The type of the node may also depend on size.
35125 // - load scalar value and broadcast
35126 // - BUILD_VECTOR node
35127 // - load from a constant pool.
35128 // We check all variants here.
35129 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
35130 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
35131 if (isSignMask(cast<ConstantFP>(C)))
35134 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
35135 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
35136 if (isSignMask(CN->getConstantFPValue()))
35139 } else if (auto *C = getTargetConstantFromNode(Op1)) {
35140 if (C->getType()->isVectorTy()) {
35141 if (auto *SplatV = C->getSplatValue())
35142 if (isSignMask(cast<ConstantFP>(SplatV)))
35144 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
35145 if (isSignMask(FPConst))
35151 /// Do target-specific dag combines on floating point negations.
35152 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
35153 const X86Subtarget &Subtarget) {
35154 EVT OrigVT = N->getValueType(0);
35155 SDValue Arg = isFNEG(N);
35156 assert(Arg.getNode() && "N is expected to be an FNEG node");
35158 EVT VT = Arg.getValueType();
35159 EVT SVT = VT.getScalarType();
35162 // Let legalize expand this if it isn't a legal type yet.
35163 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
35166 // If we're negating a FMUL node on a target with FMA, then we can avoid the
35167 // use of a constant by performing (-0 - A*B) instead.
35168 // FIXME: Check rounding control flags as well once it becomes available.
35169 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
35170 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
35171 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
35172 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
35173 Arg.getOperand(1), Zero);
35174 return DAG.getBitcast(OrigVT, NewNode);
35177 // If we're negating an FMA node, then we can adjust the
35178 // instruction to include the extra negation.
35179 unsigned NewOpcode = 0;
35180 if (Arg.hasOneUse()) {
35181 switch (Arg.getOpcode()) {
35182 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
35183 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
35184 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
35185 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
35186 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
35187 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
35188 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
35189 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
// We can't handle a scalar intrinsic node here because it would only
// invert one element and not the whole vector. But we could try to handle
// a negation of the lower element only.
35196 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
35197 Arg.getNode()->ops()));
35202 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
35203 const X86Subtarget &Subtarget) {
35204 MVT VT = N->getSimpleValueType(0);
35205 // If we have integer vector types available, use the integer opcodes.
35206 if (VT.isVector() && Subtarget.hasSSE2()) {
35209 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
35211 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
35212 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
35213 unsigned IntOpcode;
35214 switch (N->getOpcode()) {
35215 default: llvm_unreachable("Unexpected FP logic op");
35216 case X86ISD::FOR: IntOpcode = ISD::OR; break;
35217 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
35218 case X86ISD::FAND: IntOpcode = ISD::AND; break;
35219 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
35221 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
35222 return DAG.getBitcast(VT, IntOp);
35228 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
35229 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
35230 if (N->getOpcode() != ISD::XOR)
35233 SDValue LHS = N->getOperand(0);
35234 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
35235 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
35238 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
35239 X86::CondCode(LHS->getConstantOperandVal(0)));
35241 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
35244 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
35245 TargetLowering::DAGCombinerInfo &DCI,
35246 const X86Subtarget &Subtarget) {
// If this is SSE1-only, convert to FXOR to avoid scalarization.
35248 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
35249 N->getValueType(0) == MVT::v4i32) {
35250 return DAG.getBitcast(
35251 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
35252 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
35253 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
35256 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
35259 if (DCI.isBeforeLegalizeOps())
35262 if (SDValue SetCC = foldXor1SetCC(N, DAG))
35265 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
35268 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
35272 return combineFneg(N, DAG, Subtarget);
35277 static bool isNullFPScalarOrVectorConst(SDValue V) {
35278 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
35281 /// If a value is a scalar FP zero or a vector FP zero (potentially including
35282 /// undefined elements), return a zero constant that may be used to fold away
35283 /// that value. In the case of a vector, the returned constant will not contain
35284 /// undefined elements even if the input parameter does. This makes it suitable
35285 /// to be used as a replacement operand with operations (eg, bitwise-and) where
35286 /// an undef should not propagate.
35287 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
35288 const X86Subtarget &Subtarget) {
35289 if (!isNullFPScalarOrVectorConst(V))
35292 if (V.getValueType().isVector())
35293 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    if (V.getSimpleValueType().isVector())
      return ISD::isBuildVectorAllOnes(V.getNode());
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
    return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unknown opcode");
  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: Check for global or instruction-level "nnan". In that case, we
  //       should be able to lower to FMAX/FMIN alone.
  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.

  EVT VT = N->getValueType(0);
  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

  // This takes at least 3 instructions, so favor a library call when operating
  // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                    Op1
  //                Num     NaN
  //              ----------------
  //         Num  |  Max  |  Op0 |
  //  Op0         ----------------
  //         NaN  |  Op1  |  NaN |
  //              ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but
  // rather to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
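
// A rough sketch of the nodes built above for (fmaxnum %op0, %op1):
//   %minmax = X86ISD::FMAX %op1, %op0  // passes %op0 through on any NaN input
//   %isnan  = setcc %op0, %op0, setuo  // true iff %op0 is NaN
//   %result = select %isnan, %op1, %minmax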
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return N->getOperand(1);

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

  EVT VT = N->getValueType(0);

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  return SDValue();
}
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  unsigned BitWidth = N1.getValueSizeInBits();
  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
  if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
    return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);

  return SDValue();
}
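
// For example, with an i32 bit-index operand only the low 5 bits are demanded,
// so an explicit mask of the index becomes redundant:
//   (X86ISD::BT %x, (and %idx, 31)) --> (X86ISD::BT %x, %idx)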
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
  // since there is no sign-extended shift-right operation on a vector with
  // 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}
/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no-wrap 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext'.
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // extended with matching no-wrap semantics.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}
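
// For example (assuming the i64 result feeds another add or shift, so there
// is LEA potential):
//   (sext i64 (add nsw i32 %x, 16)) --> (add nsw i64 (sext %x), 16)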
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 ||
      !(VT == MVT::i32 || VT == MVT::i64))
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  // If this was a 64-bit extend, complete it.
  if (VT == MVT::i64)
    return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
  return R.getValue(1);
}
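
// For example:
//   (sext i32 (sdivrem i8 %x, %y):1)
//     --> (X86ISD::SDIVREM8_SEXT_HREG %x, %y):1
// so the remainder is sign-extended directly out of AH.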
// If we encounter an {ANY,SIGN,ZERO}_EXTEND applied to a CMOV with constant
// operands, and the result of the CMOV is not used anywhere else, promote the
// CMOV itself instead of promoting its result. This could be beneficial
// because:
//     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
//        (or more) pseudo-CMOVs only when they go one-after-another and
//        getting rid of result extension code after CMOV will help that.
//     2) Promotion of constant CMOV arguments is free, hence the
//        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
//     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
//        promotion is also good in terms of code-size.
//        (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
//         promotion).
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
  SDValue CMovN = Extend->getOperand(0);
  if (CMovN.getOpcode() != X86ISD::CMOV)
    return SDValue();

  EVT TargetVT = Extend->getValueType(0);
  unsigned ExtendOpcode = Extend->getOpcode();
  SDLoc DL(Extend);

  EVT VT = CMovN.getValueType();
  SDValue CMovOp0 = CMovN.getOperand(0);
  SDValue CMovOp1 = CMovN.getOperand(1);

  bool DoPromoteCMOV =
      (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
      CMovN.hasOneUse() &&
      (isa<ConstantSDNode>(CMovOp0.getNode()) &&
       isa<ConstantSDNode>(CMovOp1.getNode()));

  if (!DoPromoteCMOV)
    return SDValue();

  CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
  CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);

  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
                     CMovN.getOperand(2), CMovN.getOperand(3));
}
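
// For example (constants chosen arbitrarily):
//   (zext i32 (cmov i16 7, 9, %cond, %eflags))
//     --> (cmov i32 7, 9, %cond, %eflags)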
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
      Opcode != ISD::ANY_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InSVT = N0.getValueType().getScalarType();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  // Input type must be extending a bool vector (bit-casted from a scalar
  // integer) to legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
    return SDValue();
  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  EVT SclVT = N0.getOperand(0).getValueType();
  if (!SclVT.isScalarInteger())
    return SDValue();

  SDLoc DL(N);
  SDValue Vec;
  SmallVector<int, 32> ShuffleMask;
  unsigned NumElts = VT.getVectorNumElements();
  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

  // Broadcast the scalar integer to the vector elements.
  if (NumElts > EltSizeInBits) {
    // If the scalar integer is greater than the vector element size, then we
    // must split it down into sub-sections for broadcasting. For example:
    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
    unsigned Scale = NumElts / EltSizeInBits;
    EVT BroadcastVT =
        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
    Vec = DAG.getBitcast(VT, Vec);

    for (unsigned i = 0; i != Scale; ++i)
      ShuffleMask.append(EltSizeInBits, i);
  } else {
    // For smaller scalar integers, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to
    // all elements.
    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
    ShuffleMask.append(NumElts, 0);
  }
  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

  // Now, mask the relevant bit in each element.
  SmallVector<SDValue, 32> Bits;
  for (unsigned i = 0; i != NumElts; ++i) {
    int BitIdx = (i % EltSizeInBits);
    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
  }
  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

  // Compare against the bitmask and extend the result.
  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

  // For SEXT, this is now done, otherwise shift the result down for
  // zero-extension.
  if (Opcode == ISD::SIGN_EXTEND)
    return Vec;
  return DAG.getNode(ISD::SRL, DL, VT, Vec,
                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
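
// A rough sketch for (sext v8i16 (bitcast i8 %mask to v8i1)): broadcast %mask
// to all 8 lanes, AND lane i with the constant (1 << i), compare each lane for
// equality against that same bit constant, and sign-extend the per-lane i1
// result (or shift it down for a zero-extension).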
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
/// with UNDEFs) of the input to vectors of the same size as the target type
/// which then extends the lowest elements.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal then we will be
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };

  // If target-size is less than 128-bits, extend to a type that would extend
  // to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to allow the legalizer do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (!DCI.isBeforeLegalizeOps()) {
    if (InVT == MVT::i1) {
      SDValue Zero = DAG.getConstant(0, DL, VT);
      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
      return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
    }
    return SDValue();
  }

  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Inverting and sign-extending a boolean is the same as zero-extending and
    // subtracting 1 because 0 becomes -1 and 1 becomes 0. The subtract is
    // efficiently lowered with an LEA or a DEC. This is the same as:
    // select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (Subtarget.hasAVX() && VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = NegVal;
      return true;
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
              N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
              N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

  // Negative multiplication when NegA xor NegB
  bool NegMul = (NegA != NegB);
  bool HasNeg = NegA || NegB || NegC;

  unsigned NewOpcode;
  if (!NegMul)
    NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
  else
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  // For FMA, we risk reconstructing the node we started with.
  // In order to avoid this, we check for negation or opcode change. If
  // one of the two happened, then it is a new node and we return it.
  if (N->getOpcode() == ISD::FMA) {
    if (HasNeg || NewOpcode != N->getOpcode())
      return DAG.getNode(NewOpcode, dl, VT, A, B, C);
    return SDValue();
  }

  if (N->getOpcode() == X86ISD::FMADD_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADD_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADD4S) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADD4S; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB4S; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
    }
  } else {
    llvm_unreachable("Unexpected opcode!");
  }

  // Only return the node if the opcode was changed or one of the
  // operands was negated. If not, we'll just recreate the same node.
  if (HasNeg || NewOpcode != N->getOpcode()) {
    if (N->getNumOperands() == 4)
      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }

  return SDValue();
}
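
// For example:
//   (fma %a, %b, (fneg %c))        --> (X86ISD::FMSUB  %a, %b, %c)
//   (fma (fneg %a), %b, %c)        --> (X86ISD::FNMADD %a, %b, %c)
//   (fma (fneg %a), %b, (fneg %c)) --> (X86ISD::FNMSUB %a, %b, %c)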
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SDValue NegVal = isFNEG(N->getOperand(2).getNode());
  if (!NegVal)
    return SDValue();

  unsigned NewOpcode;
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::FMADDSUB:     NewOpcode = X86ISD::FMSUBADD;     break;
  case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
  case X86ISD::FMSUBADD:     NewOpcode = X86ISD::FMADDSUB;     break;
  case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
  }

  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                       NegVal, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                     NegVal);
}
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison, but ignore a
  // comparison with zero because that gets special treatment in EmitTest().
  SDValue X = SetCC->getOperand(0);
  SDValue Y = SetCC->getOperand(1);
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
    return SDValue();

  // Bail out if we know that this is not really just an oversized integer.
  if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
      peekThroughBitcasts(Y).getValueType() == MVT::f128)
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  // TODO: Add support for AVX-512.
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
      (OpSize == 256 && Subtarget.hasAVX2())) {
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
    SDValue VecX = DAG.getBitcast(VecVT, X);
    SDValue VecY = DAG.getBitcast(VecVT, Y);

    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
    SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
                                    MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}
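
// On SSE2, an i128 equality test therefore lowers to something roughly like:
//   pcmpeqb  %xmm1, %xmm0
//   pmovmskb %xmm0, %eax
//   cmpl     $0xFFFF, %eax
// with AVX2 using the 256-bit forms and a 0xFFFFFFFF mask.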
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    EVT OpVT = LHS.getValueType();
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    // Put build_vectors on the right.
    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }

    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Src = N->getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());

  // MOVMSK only uses the MSB from each vector element.
  KnownBits Known;
  APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
  if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
    DCI.AddToWorklist(Src.getNode());
    DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }

  return SDValue();
}
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  // Pre-shrink oversized index elements to avoid triggering scalarization.
  if (DCI.isBeforeLegalize()) {
    SDValue Index = N->getOperand(4);
    if (Index.getScalarValueSizeInBits() > 64) {
      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
                                     Index.getValueType().getVectorNumElements());
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Trunc;
      DAG.UpdateNodeOperands(N, NewOps);
      DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // Try to remove sign extends from i32 to i64 on the index.
  // Only do this before legalize in case we are relying on it for
  // legalization.
  // TODO: We should maybe remove any sign extend once we learn how to sign
  // extend narrow index during lowering.
  if (DCI.isBeforeLegalizeOps()) {
    SDValue Index = N->getOperand(4);
    if (Index.getScalarValueSizeInBits() == 64 &&
        Index.getOpcode() == ISD::SIGN_EXTEND &&
        Index.getOperand(0).getScalarValueSizeInBits() == 32) {
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Index.getOperand(0);
      DAG.UpdateNodeOperands(N, NewOps);
      // The original sign extend now has fewer users; add it back to the
      // worklist in case it needs to be removed.
      DCI.AddToWorklist(Index.getNode());
      DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
  }

  // With AVX2 we only demand the upper bit of the mask.
  if (!Subtarget.hasAVX512()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    KnownBits Known;
    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
      DCI.AddToWorklist(Mask.getNode());
      DCI.CommitTargetLoweringOpt(TLO);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //    AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);

  return SDValue();
}
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() &&
      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (InVT.isVector())
        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
                                   InVT.getVectorNumElements());
      SDLoc dl(N);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();

    if (!Ld->isVolatile() && !VT.isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !Subtarget.is64Bit() && LdVT == MVT::i64) {
      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
    MVT VT = N->getSimpleValueType(0);
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  return SDValue();
}
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // the carry result is dead.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, DL,
                                                           MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
    MVT VT = N->getSimpleValueType(0);
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  return SDValue();
}
/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
/// which is more useful than 0/1 in some cases.
static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
  SDLoc DL(N);
  // "Condition code B" is also known as "the carry flag" (CF).
  SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
  SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
  MVT VT = N->getSimpleValueType(0);
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
}
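
// For an i8 result this materializes as, roughly:
//   sbb %al, %al      // %al = CF ? 0xFF : 0x00
//   and $1, %al       // reduce the all-ones pattern back to 0/1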
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
  bool IsSub = N->getOpcode() == ISD::SUB;
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);

  // If this is an add, canonicalize a zext operand to the RHS.
  // TODO: Incomplete? What if both sides are zexts?
  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
      Y.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(X, Y);

  // Look through a one-use zext.
  bool PeekedThroughZext = false;
  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
    Y = Y.getOperand(0);
    PeekedThroughZext = true;
  }

  // If this is an add, canonicalize a setcc operand to the RHS.
  // TODO: Incomplete? What if both sides are setcc?
  // TODO: Should we allow peeking through a zext of the other operand?
  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);

  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

  // If X is -1 or 0, then we have an opportunity to avoid constants required
  // in the general case below.
  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
  if (ConstantX) {
    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
        (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
      // This is a complicated way to get -1 or 0 from the carry flag:
      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
                         Y.getOperand(1));
    }

    if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
        (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
      SDValue EFLAGS = Y->getOperand(1);
      if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
          EFLAGS.getValueType().isInteger() &&
          !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
        // Swap the operands of a SUB, and we have the same pattern as above.
        // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
        //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
        SDValue NewSub = DAG.getNode(
            X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
            EFLAGS.getOperand(1), EFLAGS.getOperand(0));
        SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
        return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                           DAG.getConstant(X86::COND_B, DL, MVT::i8),
                           NewEFLAGS);
      }
    }
  }

  if (CC == X86::COND_B) {
    // X + SETB Z --> X + (mask SBB Z, Z)
    // X - SETB Z --> X - (mask SBB Z, Z)
    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
  }

  if (CC == X86::COND_A) {
    SDValue EFLAGS = Y->getOperand(1);
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
    // cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = Y.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue Z = Cmp.getOperand(0);
  EVT ZVT = Z.getValueType();

  // If X is -1 or 0, then we have an opportunity to avoid constants required
  // in the general case below.
  if (ConstantX) {
    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb'
    // with fake operands:
    //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
    // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
    if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
        (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
      SDValue Zero = DAG.getConstant(0, DL, ZVT);
      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
                         SDValue(Neg.getNode(), 1));
    }

    // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using
    // 'sbb' with fake operands:
    //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
    // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
    if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
        (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
      SDValue One = DAG.getConstant(1, DL, ZVT);
      SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
    }
  }

  // (cmp Z, 1) sets the carry flag if Z is 0.
  SDValue One = DAG.getConstant(1, DL, ZVT);
  SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

  // Add the flags type for ADC/SBB nodes.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
  if (CC == X86::COND_NE)
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
                       DAG.getConstant(-1ULL, DL, VT), Cmp1);

  // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
  // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), Cmp1);
}
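
// For example:
//   (add %x, (setcc ne %z, 0)) --> (X86ISD::SBB %x, -1, (cmp %z, 1))
//   (sub %x, (setcc ne %z, 0)) --> (X86ISD::ADC %x, -1, (cmp %z, 1))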
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue MulOp = N->getOperand(0);
  SDValue Phi = N->getOperand(1);

  if (MulOp.getOpcode() != ISD::MUL)
    std::swap(MulOp, Phi);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();

  EVT VT = N->getValueType(0);

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;
  unsigned VectorSize = VT.getVectorNumElements() * 16;
  // If the vector size is less than 128, or greater than the supported
  // RegSize, do not use PMADD.
  if (VectorSize < 128 || VectorSize > RegSize)
    return SDValue();

  SDLoc DL(N);
  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                   VT.getVectorNumElements());
  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                VT.getVectorNumElements() / 2);

  // Shrink the operands of mul.
  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));

  // Madd vector size is half of the original vector size.
  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
  // Fill the rest of the output with 0.
  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
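
// For example, for a v8i32 reduction whose mul operands fit in i16:
//   %a16  = trunc %a to v8i16 ; %b16 = trunc %b to v8i16
//   %madd = X86ISD::VPMADDWD %a16, %b16        // v4i32
//   %wide = concat_vectors %madd, zero         // widen back to v8i32
//   add %wide, %phi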
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // TODO: There's nothing special about i32, any integer type above i16 should
  // work just as well.
  if (!VT.isVector() || !VT.isSimple() ||
      !(VT.getVectorElementType() == MVT::i32))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

  // We know N is a reduction add, which means one of its operands is a phi.
  // To match SAD, we need the other operand to be a vector select.
  SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();

  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we could only update
  // part of elements in the reduction vector.
  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

  // The output of PSADBW is a vector of i64.
  // We need to turn the vector of i64 into a vector of i32.
  // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
    // Fill the upper elements with zero to match the add width.
    SDValue Zero = DAG.getConstant(0, DL, VT);
    Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
                      DAG.getIntPtrConstant(0, DL));
  }

  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
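
// A rough sketch of the rewrite above for a byte-difference reduction:
//   %sad = psadbw %a, %b   // i64 lanes holding sums of |a[i]-b[i]| per group
// then bitcast/truncate %sad to i32 lanes, zero the upper lanes if the
// reduction vector is wider, and add the result into %phi.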

/// Convert vector increment or decrement to sub/add with an all-ones constant:
/// add X, <1, 1...> --> sub X, <-1, -1...>
/// sub X, <1, 1...> --> add X, <-1, -1...>
/// The all-ones vector constant can be materialized using a pcmpeq instruction
/// that is commonly recognized as an idiom (has no register dependency), so
/// that's better/smaller than loading a splat 1 constant.
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
         "Unexpected opcode for increment/decrement transform");

  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
  // out and wait for legalization if we have an unsupported vector length.
  EVT VT = N->getValueType(0);
  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
    return SDValue();

  SDNode *N1 = N->getOperand(1).getNode();
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(N1, SplatVal) ||
      !SplatVal.isOneValue())
    return SDValue();

  SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
  unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}
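
// A minimal sketch of the codegen this enables (assuming SSE2 and v4i32;
// register choices are up to the allocator, so this is not a guarantee):
//   pcmpeqd %xmm1, %xmm1        ; all-ones idiom: no load, no input dependency
//   psubd   %xmm1, %xmm0        ; x - (-1) == x + 1
// versus loading a splat <1,1,1,1> from the constant pool to feed a paddd.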

static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags Flags = N->getFlags();
  if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}
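
// Shape of the horizontal-add match above (a sketch; see isHorizontalBinOp
// for the exact mask requirements): with
//   Op0 = shuffle(X, Y, <0, 2, 4, 6>)   // even elements
//   Op1 = shuffle(X, Y, <1, 3, 5, 7>)   // odd elements
// the add pairs adjacent elements of X and Y, which is exactly what a single
// PHADDW/PHADDD computes.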

static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // PSUBUS is supported, starting from SSE2, but special preprocessing
  // for v8i32 requires umin, which appears in SSE41.
  if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
      !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
      !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
      !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
        (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
         VT == MVT::v8i64)))
    return SDValue();

  SDValue SubusLHS, SubusRHS;
  // Try to find umax(a,b) - b or a - umin(a,b) patterns;
  // they may be converted to subus(a,b).
  // TODO: Need to add IR canonicalization for this code.
  if (Op0.getOpcode() == ISD::UMAX) {
    SubusRHS = Op1;
    SDValue MaxLHS = Op0.getOperand(0);
    SDValue MaxRHS = Op0.getOperand(1);
    if (MaxLHS == Op1)
      SubusLHS = MaxRHS;
    else if (MaxRHS == Op1)
      SubusLHS = MaxLHS;
    else
      return SDValue();
  } else if (Op1.getOpcode() == ISD::UMIN) {
    SubusLHS = Op0;
    SDValue MinLHS = Op1.getOperand(0);
    SDValue MinRHS = Op1.getOperand(1);
    if (MinLHS == Op0)
      SubusRHS = MinRHS;
    else if (MinRHS == Op0)
      SubusRHS = MinLHS;
    else
      return SDValue();
  } else
    return SDValue();

  // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
  // special preprocessing in some cases.
  if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
    return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

  // The special preprocessing can only be applied if the value was
  // zero-extended from 16 bits, so we require the first 16 bits to be zero
  // for 32-bit values, or the first 48 bits for 64-bit values.
  KnownBits Known;
  DAG.computeKnownBits(SubusLHS, Known);
  unsigned NumZeros = Known.countMinLeadingZeros();
  if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
    return SDValue();

  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType;
  if (VT == MVT::v8i32 || VT == MVT::v8i64)
    ShrinkedType = MVT::v8i16;
  else
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

  // If SubusLHS is zero-extended, truncate SubusRHS to its size:
  // SubusRHS = umin(0xFFF.., SubusRHS).
  SDValue SaturationConst =
      DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
                                           ShrinkedType.getScalarSizeInBits()),
                      SDLoc(SubusLHS), ExtType);
  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
                             SaturationConst);
  SDValue NewSubusLHS =
      DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
  SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
                               NewSubusLHS, NewSubusRHS);
  // Zero-extend the result; it may be used somewhere as a 32-bit value.
  // If it is not, the zext and the following trunc will be combined away.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
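
// Why the rewrite above is sound (a sketch of the identities used):
//   umax(a, b) - b == (a >= b ? a - b : 0) == subus(a, b)
//   a - umin(a, b) == (a >= b ? a - b : 0) == subus(a, b)
// so, for example, a v8i16 'sub (umax a, b), b' becomes a single PSUBUSW,
// the unsigned-saturating subtract.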

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  // Try to create PSUBUS if SUB's argument is max/min.
  if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  MVT SVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
  unsigned InputBits = OpEltSizeInBits * NumElts;

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
    APInt Undefs(NumElts, 0);
    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
    for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }

  // (vzext (bitcast (vzext x))) -> (vzext x)
  // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
  SDValue V = peekThroughBitcasts(Op);
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  // TODO: Add X86ISD::VSEXT support
  if (Opcode == X86ISD::VZEXT &&
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}

static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // TEST (AND a, b), (AND a, b) -> TEST a, b
  if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
    return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
                       Op0->getOperand(1));

  // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
  // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
  if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
      ISD::isBuildVectorAllZeros(Op1.getNode()))
    return getZeroVector(VT, Subtarget, DAG, DL);

  return SDValue();
}
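
// For reference (a summary; see the AVX-512 documentation for the
// authoritative definition): X86ISD::TESTM is the DAG form of VPTESTM*,
// which sets mask bit i to ((a[i] & b[i]) != 0). Since x & x == x, testing
// AND(a, b) against itself is exactly TESTM(a, b), which is what the first
// fold above relies on.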

static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) == N->getOperand(1)) {
    if (N->getOpcode() == X86ISD::PCMPEQ)
      return getOnesVector(VT, DAG, DL);
    if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}
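
// Rationale: an integer compare of a vector against itself is a constant,
// because x == x is always true and x > x is always false, so
//   PCMPEQ X, X --> all-ones vector
//   PCMPGT X, X --> all-zeros vector
// regardless of the value of X.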

static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT OpVT = N->getSimpleValueType(0);

  // Early out for mask vectors.
  if (OpVT.getVectorElementType() == MVT::i1)
    return SDValue();

  SDLoc dl(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);

  unsigned IdxVal = N->getConstantOperandVal(2);
  MVT SubVecVT = SubVec.getSimpleValueType();

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // Inserting zeros into zeros is a nop.
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      return Vec;

    // If we're inserting into a zero vector and then into a larger zero
    // vector, just insert into the larger zero vector directly.
    if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
        ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
      unsigned Idx2Val = SubVec.getConstantOperandVal(2);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
                         SubVec.getOperand(1),
                         DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
    }

    // If we're inserting a bitcast into zeros, rewrite the insert and move the
    // bitcast to the other side. This helps with detecting zero extending
    // during isel.
    // TODO: Is this useful for other indices than 0?
    if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
      MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
      unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
      MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
      SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
                                   DAG.getBitcast(NewVT, Vec),
                                   SubVec.getOperand(0), N->getOperand(2));
      return DAG.getBitcast(OpVT, Insert);
    }
  }

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subregister operation.
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !Vec.isUndef())) {
    int ExtIdxVal = SubVec.getConstantOperandVal(1);
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now insert the extracted portion.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
    if (Idx2 && Idx2->getZExtValue() == 0) {
      SDValue SubVec2 = Vec.getOperand(1);
      // If needed, look through bitcasts to get to the load.
      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
        unsigned AS = FirstLd->getAddressSpace();
        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
                                    OpVT, AS, Alignment, &Fast) && Fast) {
          SDValue Ops[] = {SubVec2, SubVec};
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
            return Ld;
        }
      }
      // If lower/upper loads are the same and the only users of the load, then
      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);

      // If this is subv_broadcast insert into both halves, use a larger
      // subv_broadcast.
      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
                           SubVec.getOperand(0));

      // If we're inserting all zeros into the upper half, change this to
      // an insert into an all zeros vector. We will match this to a move
      // with implicit upper bit zeroing during isel.
      if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                           getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
                           Vec.getOperand(2));

      // If we are inserting into both halves of the vector, the starting
      // vector should be undef. If it isn't, make it so. Only do this if the
      // early insert has no other uses.
      // TODO: Should this be a generic DAG combine?
      if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
        Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
                          SubVec2, Vec.getOperand(2));
        DCI.AddToWorklist(Vec.getNode());
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
                           N->getOperand(2));
      }
    }
  }

  return SDValue();
}
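
// Concrete instance of the consecutive-load fold above (a sketch; the final
// instruction depends on alignment and subtarget features):
//   t0 = (load16 addr)
//   t1 = (load16 addr + 16)
//   (insert_subvector (insert_subvector undef, t0, 0), t1, Elts/2)
// becomes a single 32-byte load, while the same pattern with t1 reloading
// from 'addr' becomes a VBROADCASTF128/VBROADCASTI128 of those 16 bytes.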

static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT OpVT = N->getSimpleValueType(0);
  SDValue InVec = N->getOperand(0);
  unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();

  if (ISD::isBuildVectorAllZeros(InVec.getNode()))
    return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));

  if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
    if (OpVT.getScalarType() == MVT::i1)
      return DAG.getConstant(1, SDLoc(N), OpVT);
    return getOnesVector(OpVT, DAG, SDLoc(N));
  }

  if (InVec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        OpVT, SDLoc(N),
        InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  case ISD::EXTRACT_SUBVECTOR:
    return combineExtractSubvector(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
  case ISD::ADD: return combineAdd(N, DAG, Subtarget);
  case ISD::SUB: return combineSub(N, DAG, Subtarget);
  case X86ISD::SBB: return combineSBB(N, DAG);
  case X86ISD::ADC: return combineADC(N, DAG, DCI);
  case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
  case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE: return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
  case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
  case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::BT: return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VSEXT:
  case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP: // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VBROADCAST:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case X86ISD::FMADDS1:
  case X86ISD::FMADDS3:
  case X86ISD::FMADD4S:
  case ISD::FMA: return combineFMA(N, DAG, Subtarget);
  case X86ISD::FMADDSUB_RND:
  case X86ISD::FMSUBADD_RND:
  case X86ISD::FMADDSUB:
  case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
  case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
  case X86ISD::MGATHER:
  case X86ISD::MSCATTER:
  case ISD::MGATHER:
  case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
  case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}

/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}
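
// Example of why i16 is undesirable here (general x86 background, not a
// statement about any particular CPU): 16-bit ALU ops in 32/64-bit mode need
// the 0x66 operand-size prefix and write only part of the destination
// register, so
//   addw %ax, %bx        ; longer encoding, partial-register write
// is usually worse than the promoted
//   addl %eax, %ebx
// even though both are legal.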

/// This function checks whether any user of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}

void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
  if (hasCopyImplyingStackAdjustment(MF)) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    MFI.setHasCopyImplyingStackAdjustment(true);
  }

  TargetLoweringBase::finalizeLowering(MF);
}

/// This method queries the target whether it is beneficial for the dag
/// combiner to promote the specified node. If true, it should return the
/// desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

bool X86TargetLowering::
    isDesirableToCombineBuildVectorToShuffleTruncate(
        ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {

  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
         "Element count mismatch");
  assert(
      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
      "Shuffle Mask expected to be legal");

  // For 32-bit elements VPERMD is better than shuffle+truncate.
  // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
  if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
    return false;

  if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
    return false;

  return true;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match an asm string against a list of pieces separated by
// whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return true;
}
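
// Usage sketch (hypothetical inputs): matchAsm("bswap $0", {"bswap", "$0"})
// and matchAsm("  bswapl   $0", {"bswapl", "$0"}) both return true, while
// matchAsm("bswapl $0", {"bswap", "$0"}) returns false, because "bswapl"
// only matches "bswap" as a prefix of the first token and the prefix check
// above rejects that.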

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w} --> llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
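
// Example of what this expansion catches (a sketch at the IR level):
//   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
// is replaced, via IntrinsicLowering::LowerToByteSwap, with
//   %r = call i32 @llvm.bswap.i32(i32 %x)
// so the byte swap can participate in further combines instead of staying
// an opaque asm blob.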

/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'v':
    case 'Y':
    case 'l':
    case 'k': // AVX512 masking registers.
      return C_RegisterClass;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'z':
      case '0':
        return C_Register;
      case 'i':
      case 'm':
      case 'k':
      case 't':
      case '2':
        return C_RegisterClass;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y': {
    unsigned Size = StringRef(constraint).size();
    // Pick 'i' as the next char, as 'Yi' and 'Y' are synonymous when
    // matching 'Y'.
    char NextChar = Size == 2 ? constraint[1] : 'i';
    if (Size > 2)
      break;
    switch (NextChar) {
      default:
        weight = CW_Invalid;
        break;
      // XMM0
      case 'z':
      case '0':
        if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
          return CW_SpecificReg;
        break;
      // Conditional OpMask regs (AVX512)
      case 'k':
        if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
          return CW_Register;
        break;
      // Any MMX reg
      case 'm':
        if (type->isX86_MMXTy() && Subtarget.hasMMX())
          return weight;
        break;
      // Any SSE reg when ISA >= SSE2, same as 'Y'
      case 'i':
      case 't':
      case '2':
        if (!Subtarget.hasSSE2())
          return CW_Invalid;
        break;
    }
    // Fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  }
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}

std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q': // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r': // GENERAL_REGS
    case 'l': // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R': // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f': // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y': // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        if (!Subtarget.hasAVX512()) break;
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'i':
    case 't':
    case '2':
      return getRegForInlineAsmConstraint(TRI, "Y", VT);
    case 'm':
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'z':
    case '0':
      if (!Subtarget.hasSSE1()) break;
      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0+Constraint[4]-'0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res; // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      bool is64Bit = Subtarget.is64Bit();
      const TargetRegisterClass *RC =
          Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
        : &X86::GR64RegClass;
      if (RC->contains(DestReg))
        Res = std::make_pair(DestReg, RC);
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
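
// Worked example of the size-fixing path above (a sketch grounded in the
// comments in the code): for an explicit register constraint like "{ax}"
// paired with an i32 operand, the generic mapper returns AX, and the
// isGRClass resizing maps it to (X86::EAX, GR32) so the operand occupies a
// single 32-bit register instead of splitting across {ax},{dx}.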

int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}

void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}

/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // If the function specifically requests stack probes, emit them.
  if (MF.getFunction().hasFnAttribute("probe-stack"))
    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

  // Generally, if we aren't on Windows, the platform ABI does not include
  // support for stack probes, so don't emit them.
  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
    return "";

  // We need a stack probe to conform to the Windows ABI. Choose the right
  // symbol.
  if (Subtarget.is64Bit())
    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}